diff --git a/adapter_config.json b/adapter_config.json index 5f25f5a5ee344124c6acb2dc7e557d94323e30ce..092fee3278e3444f43369802f65b72c8a1e4b2b3 100644 --- a/adapter_config.json +++ b/adapter_config.json @@ -14,13 +14,13 @@ "r": 64, "revision": null, "target_modules": [ - "k_proj", - "down_proj", - "up_proj", - "o_proj", "v_proj", + "down_proj", + "q_proj", "gate_proj", - "q_proj" + "o_proj", + "up_proj", + "k_proj" ], "task_type": "CAUSAL_LM" } \ No newline at end of file diff --git a/adapter_model.bin b/adapter_model.bin index 57a2d3356e3f022ff52340f79e5bf6d1361afdf7..d84c7171eb208fd34875365c634b8187ad2be92d 100644 --- a/adapter_model.bin +++ b/adapter_model.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5785e99f09d371ee2dfcf2a1379bc4856ae0eff4a8f0f90219dd0ddae67439e6 +oid sha256:8c7b7177ebf043527303efc179b203f643a96dcbc5d00d10d809a5b270f2b361 size 871609293 diff --git a/all_results.json b/all_results.json index 11ca757ab27ee34349bd14c1be7a306b90ccf76a..5371452557e9f2fa06e06b3869ab90411a050b1a 100644 --- a/all_results.json +++ b/all_results.json @@ -1,11 +1,11 @@ { - "epoch": 0.03, - "eval_loss": 6.423073768615723, - "eval_runtime": 22.3351, - "eval_samples_per_second": 2.239, - "eval_steps_per_second": 1.119, - "train_loss": 1.5118552861401908, - "train_runtime": 2842.6735, - "train_samples_per_second": 10.553, - "train_steps_per_second": 10.553 + "epoch": 0.04, + "eval_loss": 6.335043907165527, + "eval_runtime": 21.5795, + "eval_samples_per_second": 2.317, + "eval_steps_per_second": 1.159, + "train_loss": 1.2399261393149694, + "train_runtime": 2886.6567, + "train_samples_per_second": 10.393, + "train_steps_per_second": 10.393 } \ No newline at end of file diff --git a/checkpoint-2000/adapter_model.bin b/checkpoint-2000/adapter_model.bin deleted file mode 100644 index 21fcecf4036cafc10e52f2215417aad4fd4776d8..0000000000000000000000000000000000000000 --- a/checkpoint-2000/adapter_model.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ca7fae775be00cb7e472a75158d8ab72644b8579a83e290f3c7c2b2bc675e8dc -size 871609293 diff --git a/checkpoint-2000/optimizer.pt b/checkpoint-2000/optimizer.pt deleted file mode 100644 index d86477d49c63b0dae6543911ef91aaf9e71e57eb..0000000000000000000000000000000000000000 --- a/checkpoint-2000/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:642bf4673b8142a63843a17e79e1f902522cc016a4c4d10c09a6183594d7f16a -size 873873439 diff --git a/checkpoint-2000/rng_state.pth b/checkpoint-2000/rng_state.pth deleted file mode 100644 index 0a5870e8d6b489c6f76bf6384285c4803cd8bf79..0000000000000000000000000000000000000000 --- a/checkpoint-2000/rng_state.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b145ff169e72f737f06951054bbd14e171358b3bb7a8261a702439d3bfaed29b -size 14511 diff --git a/checkpoint-2000/scheduler.pt b/checkpoint-2000/scheduler.pt deleted file mode 100644 index 15b464792a2939ac27b073a613d71dff1b74bc7a..0000000000000000000000000000000000000000 --- a/checkpoint-2000/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:37938855094cecb443a20382ff199f051b8482dee28568306481bf327258d4cf -size 627 diff --git a/checkpoint-2000/trainer_state.json b/checkpoint-2000/trainer_state.json deleted file mode 100644 index 436f11b5d288d39215a87cfe9c1b7ebfeefac195..0000000000000000000000000000000000000000 --- a/checkpoint-2000/trainer_state.json +++ /dev/null @@ -1,12196 +0,0 @@ -{ - "best_metric": 6.617897987365723, - "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-1600", - "epoch": 0.015277671682835536, - "global_step": 2000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0808, - "step": 1 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8773, - "step": 2 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1965, - "step": 3 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.118, - "step": 4 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1773, - "step": 5 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1165, - "step": 6 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2666, - "step": 7 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3704, - "step": 8 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9976, - "step": 9 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.985, - "step": 10 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.0541, - "step": 11 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.6228, - "step": 12 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.3651, - "step": 13 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0867, - "step": 14 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.4422, - "step": 15 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.7759, - "step": 16 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1446, - "step": 17 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0007, - "step": 18 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0894, - "step": 19 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2424, - "step": 20 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.1343, - "step": 21 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5354, - "step": 22 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1887, - "step": 23 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.6652, - "step": 24 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.964, - "step": 25 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1872, - "step": 26 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.4722, - "step": 27 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1462, - "step": 28 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0485, - "step": 29 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.148, - "step": 30 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7274, - "step": 31 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.6689, - "step": 32 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3384, - "step": 33 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.5354, - "step": 34 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.1976, - "step": 35 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.8593, - "step": 36 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.9302, - "step": 37 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5968, - "step": 38 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3169, - "step": 39 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.1793, - "step": 40 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.8457, - "step": 41 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5177, - "step": 42 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.003, - "step": 43 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.9928, - "step": 44 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.2574, - "step": 45 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3915, - "step": 46 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4105, - "step": 47 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.1184, - "step": 48 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.72, - "step": 49 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9628, - "step": 50 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2372, - "step": 51 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3733, - "step": 52 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8936, - "step": 53 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5353, - "step": 54 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.0754, - "step": 55 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6685, - "step": 56 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8984, - "step": 57 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2265, - "step": 58 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7696, - "step": 59 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7349, - "step": 60 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0221, - "step": 61 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.1901, - "step": 62 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.387, - "step": 63 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7323, - "step": 64 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2077, - "step": 65 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.3155, - "step": 66 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1656, - "step": 67 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 13.0828, - "step": 68 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5295, - "step": 69 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4575, - "step": 70 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.7654, - "step": 71 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.6263, - "step": 72 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 24.8238, - "step": 73 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.0654, - "step": 74 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 28.1046, - "step": 75 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.3232, - "step": 76 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 22.9712, - "step": 77 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 18.8529, - "step": 78 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.8356, - "step": 79 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 16.472, - "step": 80 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.2369, - "step": 81 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.0731, - "step": 82 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.8853, - "step": 83 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5438, - "step": 84 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2665, - "step": 85 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5484, - "step": 86 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7546, - "step": 87 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4309, - "step": 88 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5593, - "step": 89 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3822, - "step": 90 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.6315, - "step": 91 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6116, - "step": 92 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2288, - "step": 93 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0053, - "step": 94 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.359, - "step": 95 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9235, - "step": 96 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 31.9845, - "step": 97 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.1385, - "step": 98 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6161, - "step": 99 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8096, - "step": 100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9918, - "step": 101 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.344, - "step": 102 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1607, - "step": 103 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4834, - "step": 104 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.704, - "step": 105 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1238, - "step": 106 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8066, - "step": 107 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9656, - "step": 108 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1979, - "step": 109 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2294, - "step": 110 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.066, - "step": 111 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7914, - "step": 112 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7344, - "step": 113 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6703, - "step": 114 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8817, - "step": 115 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.7733, - "step": 116 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.469, - "step": 117 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1304, - "step": 118 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.871, - "step": 119 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5353, - "step": 120 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9055, - "step": 121 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6142, - "step": 122 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0201, - "step": 123 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3805, - "step": 124 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6825, - "step": 125 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7166, - "step": 126 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7747, - "step": 127 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7695, - "step": 128 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7291, - "step": 129 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1296, - "step": 130 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5374, - "step": 131 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1854, - "step": 132 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.434, - "step": 133 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.438, - "step": 134 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3027, - "step": 135 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.382, - "step": 136 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9277, - "step": 137 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.223, - "step": 138 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3042, - "step": 139 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6361, - "step": 140 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3547, - "step": 141 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.7181, - "step": 142 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.7528, - "step": 143 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.4316, - "step": 144 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2219, - "step": 145 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7788, - "step": 146 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2749, - "step": 147 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2397, - "step": 148 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6243, - "step": 149 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.145, - "step": 150 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7951, - "step": 151 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1862, - "step": 152 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1305, - "step": 153 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5766, - "step": 154 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9232, - "step": 155 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9936, - "step": 156 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.9692, - "step": 157 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2772, - "step": 158 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.302, - "step": 159 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9931, - "step": 160 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9675, - "step": 161 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8536, - "step": 162 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6589, - "step": 163 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.932, - "step": 164 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0301, - "step": 165 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4861, - "step": 166 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1354, - "step": 167 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0717, - "step": 168 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9346, - "step": 169 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9373, - "step": 170 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8777, - "step": 171 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4193, - "step": 172 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6831, - "step": 173 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4175, - "step": 174 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3629, - "step": 175 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.118, - "step": 176 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.633, - "step": 177 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8355, - "step": 178 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4522, - "step": 179 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9272, - "step": 180 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4631, - "step": 181 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2987, - "step": 182 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1183, - "step": 183 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9976, - "step": 184 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0668, - "step": 185 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6291, - "step": 186 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5937, - "step": 187 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7382, - "step": 188 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7677, - "step": 189 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0293, - "step": 190 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6407, - "step": 191 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9508, - "step": 192 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.5053, - "step": 193 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5718, - "step": 194 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5211, - "step": 195 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9557, - "step": 196 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1609, - "step": 197 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8505, - "step": 198 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8278, - "step": 199 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8447, - "step": 200 - }, - { - "epoch": 0.0, - "eval_loss": 7.883856773376465, - "eval_runtime": 22.4254, - "eval_samples_per_second": 2.23, - "eval_steps_per_second": 1.115, - "step": 200 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 4.629522514343262, - "step": 200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3249, - "step": 201 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.352, - "step": 202 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2984, - "step": 203 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.2734, - "step": 204 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1, - "step": 205 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.448, - "step": 206 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2387, - "step": 207 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.861, - "step": 208 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.603, - "step": 209 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.29, - "step": 210 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2105, - "step": 211 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1949, - "step": 212 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0538, - "step": 213 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0343, - "step": 214 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7794, - "step": 215 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5532, - "step": 216 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2676, - "step": 217 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.566, - "step": 218 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0432, - "step": 219 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9391, - "step": 220 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.724, - "step": 221 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.229, - "step": 222 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3462, - "step": 223 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0752, - "step": 224 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1966, - "step": 225 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7279, - "step": 226 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8484, - "step": 227 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7291, - "step": 228 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2665, - "step": 229 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3551, - "step": 230 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7338, - "step": 231 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8407, - "step": 232 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3581, - "step": 233 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.441, - "step": 234 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0788, - "step": 235 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8404, - "step": 236 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4314, - "step": 237 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8426, - "step": 238 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.0205, - "step": 239 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4162, - "step": 240 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7515, - "step": 241 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1442, - "step": 242 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5868, - "step": 243 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6514, - "step": 244 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2683, - "step": 245 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.31, - "step": 246 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0161, - "step": 247 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.484, - "step": 248 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9726, - "step": 249 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0926, - "step": 250 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5279, - "step": 251 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0017, - "step": 252 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5684, - "step": 253 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3875, - "step": 254 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9489, - "step": 255 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.8948, - "step": 256 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0856, - "step": 257 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.599, - "step": 258 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1575, - "step": 259 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3701, - "step": 260 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.464, - "step": 261 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9193, - "step": 262 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5679, - "step": 263 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9424, - "step": 264 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6689, - "step": 265 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6475, - "step": 266 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4311, - "step": 267 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7426, - "step": 268 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5191, - "step": 269 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3059, - "step": 270 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0142, - "step": 271 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.4509, - "step": 272 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0831, - "step": 273 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6977, - "step": 274 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4236, - "step": 275 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2129, - "step": 276 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1394, - "step": 277 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.685, - "step": 278 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0275, - "step": 279 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.3215, - "step": 280 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6542, - "step": 281 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7614, - "step": 282 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2996, - "step": 283 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6275, - "step": 284 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8736, - "step": 285 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4667, - "step": 286 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8486, - "step": 287 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2125, - "step": 288 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4523, - "step": 289 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.551, - "step": 290 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.7158, - "step": 291 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5092, - "step": 292 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9169, - "step": 293 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5333, - "step": 294 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9949, - "step": 295 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.7189, - "step": 296 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2366, - "step": 297 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4745, - "step": 298 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2439, - "step": 299 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4176, - "step": 300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.9365, - "step": 301 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5309, - "step": 302 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2201, - "step": 303 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0312, - "step": 304 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4173, - "step": 305 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4856, - "step": 306 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5041, - "step": 307 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3597, - "step": 308 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8395, - "step": 309 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0776, - "step": 310 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7566, - "step": 311 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9767, - "step": 312 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3804, - "step": 313 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5327, - "step": 314 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5293, - "step": 315 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4531, - "step": 316 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3961, - "step": 317 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5669, - "step": 318 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8559, - "step": 319 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.117, - "step": 320 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4279, - "step": 321 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7977, - "step": 322 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.955, - "step": 323 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0164, - "step": 324 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.0495, - "step": 325 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2768, - "step": 326 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3162, - "step": 327 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.88, - "step": 328 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2157, - "step": 329 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8427, - "step": 330 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9729, - "step": 331 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1779, - "step": 332 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1302, - "step": 333 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7705, - "step": 334 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.523, - "step": 335 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9375, - "step": 336 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.1409, - "step": 337 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.633, - "step": 338 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6481, - "step": 339 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.933, - "step": 340 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9179, - "step": 341 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9332, - "step": 342 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6553, - "step": 343 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7412, - "step": 344 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.849, - "step": 345 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.7321, - "step": 346 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9717, - "step": 347 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3465, - "step": 348 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4535, - "step": 349 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2376, - "step": 350 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9025, - "step": 351 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.916, - "step": 352 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.3785, - "step": 353 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0576, - "step": 354 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5081, - "step": 355 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1303, - "step": 356 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3854, - "step": 357 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5553, - "step": 358 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9627, - "step": 359 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.402, - "step": 360 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3484, - "step": 361 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5428, - "step": 362 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9128, - "step": 363 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3934, - "step": 364 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4812, - "step": 365 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5395, - "step": 366 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6304, - "step": 367 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5626, - "step": 368 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5693, - "step": 369 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3458, - "step": 370 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6254, - "step": 371 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8706, - "step": 372 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6076, - "step": 373 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.2912, - "step": 374 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3326, - "step": 375 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3735, - "step": 376 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4916, - "step": 377 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5553, - "step": 378 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6241, - "step": 379 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6106, - "step": 380 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.266, - "step": 381 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7738, - "step": 382 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4988, - "step": 383 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2968, - "step": 384 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8512, - "step": 385 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0341, - "step": 386 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.898, - "step": 387 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.23, - "step": 388 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9608, - "step": 389 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.3679, - "step": 390 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.7074, - "step": 391 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9903, - "step": 392 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5845, - "step": 393 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6493, - "step": 394 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7962, - "step": 395 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4865, - "step": 396 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3418, - "step": 397 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3942, - "step": 398 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4715, - "step": 399 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2073, - "step": 400 - }, - { - "epoch": 0.0, - "eval_loss": 7.106412410736084, - "eval_runtime": 22.5667, - "eval_samples_per_second": 2.216, - "eval_steps_per_second": 1.108, - "step": 400 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 2.9128687667846678, - "step": 400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3984, - "step": 401 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7983, - "step": 402 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8589, - "step": 403 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9884, - "step": 404 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4427, - "step": 405 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0374, - "step": 406 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7999, - "step": 407 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2437, - "step": 408 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6902, - "step": 409 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.81, - "step": 410 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8979, - "step": 411 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0211, - "step": 412 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3945, - "step": 413 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5807, - "step": 414 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1433, - "step": 415 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9466, - "step": 416 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6276, - "step": 417 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4945, - "step": 418 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6215, - "step": 419 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.3919, - "step": 420 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7915, - "step": 421 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3284, - "step": 422 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8723, - "step": 423 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0149, - "step": 424 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.979, - "step": 425 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9175, - "step": 426 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4994, - "step": 427 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9791, - "step": 428 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1156, - "step": 429 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5813, - "step": 430 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1882, - "step": 431 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9956, - "step": 432 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6189, - "step": 433 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9624, - "step": 434 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5387, - "step": 435 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4605, - "step": 436 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.474, - "step": 437 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0497, - "step": 438 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5705, - "step": 439 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.275, - "step": 440 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9638, - "step": 441 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4857, - "step": 442 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3067, - "step": 443 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8152, - "step": 444 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1668, - "step": 445 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5293, - "step": 446 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3981, - "step": 447 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4787, - "step": 448 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5981, - "step": 449 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3569, - "step": 450 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4088, - "step": 451 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3677, - "step": 452 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4686, - "step": 453 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3552, - "step": 454 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7931, - "step": 455 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9285, - "step": 456 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0554, - "step": 457 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7277, - "step": 458 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2474, - "step": 459 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9274, - "step": 460 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2558, - "step": 461 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7547, - "step": 462 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1264, - "step": 463 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2124, - "step": 464 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8751, - "step": 465 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7317, - "step": 466 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3697, - "step": 467 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0021, - "step": 468 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3761, - "step": 469 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2291, - "step": 470 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7968, - "step": 471 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9454, - "step": 472 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0194, - "step": 473 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5048, - "step": 474 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6837, - "step": 475 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1066, - "step": 476 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3501, - "step": 477 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5071, - "step": 478 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1086, - "step": 479 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7269, - "step": 480 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5419, - "step": 481 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2974, - "step": 482 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.1433, - "step": 483 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0869, - "step": 484 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.032, - "step": 485 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0946, - "step": 486 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7162, - "step": 487 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0406, - "step": 488 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9048, - "step": 489 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2231, - "step": 490 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.6524, - "step": 491 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1151, - "step": 492 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.591, - "step": 493 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1628, - "step": 494 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0757, - "step": 495 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3471, - "step": 496 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9385, - "step": 497 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9362, - "step": 498 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2252, - "step": 499 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.359, - "step": 500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0497, - "step": 501 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0484, - "step": 502 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5773, - "step": 503 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.39, - "step": 504 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5923, - "step": 505 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2, - "step": 506 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5536, - "step": 507 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.8958, - "step": 508 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7763, - "step": 509 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2045, - "step": 510 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4219, - "step": 511 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6305, - "step": 512 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4243, - "step": 513 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7842, - "step": 514 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8769, - "step": 515 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8903, - "step": 516 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0489, - "step": 517 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1314, - "step": 518 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5973, - "step": 519 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8022, - "step": 520 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3539, - "step": 521 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.222, - "step": 522 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5403, - "step": 523 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1323, - "step": 524 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7813, - "step": 525 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4982, - "step": 526 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2426, - "step": 527 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0142, - "step": 528 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8996, - "step": 529 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8671, - "step": 530 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4139, - "step": 531 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9478, - "step": 532 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7062, - "step": 533 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0098, - "step": 534 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9195, - "step": 535 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0255, - "step": 536 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6291, - "step": 537 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3245, - "step": 538 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6382, - "step": 539 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.8076, - "step": 540 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6725, - "step": 541 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0563, - "step": 542 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.6178, - "step": 543 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7974, - "step": 544 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7535, - "step": 545 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4948, - "step": 546 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8941, - "step": 547 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6496, - "step": 548 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9084, - "step": 549 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.65, - "step": 550 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7673, - "step": 551 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2221, - "step": 552 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.14, - "step": 553 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.6747, - "step": 554 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8009, - "step": 555 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7307, - "step": 556 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0143, - "step": 557 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8098, - "step": 558 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.026, - "step": 559 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4572, - "step": 560 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7913, - "step": 561 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9962, - "step": 562 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.767, - "step": 563 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9497, - "step": 564 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9626, - "step": 565 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2536, - "step": 566 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0421, - "step": 567 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8177, - "step": 568 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9241, - "step": 569 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0162, - "step": 570 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3368, - "step": 571 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7515, - "step": 572 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6389, - "step": 573 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.662, - "step": 574 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8097, - "step": 575 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9346, - "step": 576 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3154, - "step": 577 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7724, - "step": 578 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3685, - "step": 579 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.2775, - "step": 580 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.106, - "step": 581 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4733, - "step": 582 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2334, - "step": 583 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9478, - "step": 584 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0013, - "step": 585 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7242, - "step": 586 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.922, - "step": 587 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1418, - "step": 588 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4472, - "step": 589 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4785, - "step": 590 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.783, - "step": 591 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0706, - "step": 592 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4136, - "step": 593 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5969, - "step": 594 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5157, - "step": 595 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5658, - "step": 596 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4647, - "step": 597 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2028, - "step": 598 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6913, - "step": 599 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7239, - "step": 600 - }, - { - "epoch": 0.0, - "eval_loss": 7.012163162231445, - "eval_runtime": 22.5807, - "eval_samples_per_second": 2.214, - "eval_steps_per_second": 1.107, - "step": 600 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.24488224029541, - "step": 600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5253, - "step": 601 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0392, - "step": 602 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.447, - "step": 603 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9441, - "step": 604 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1874, - "step": 605 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7817, - "step": 606 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0348, - "step": 607 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5593, - "step": 608 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9361, - "step": 609 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3534, - "step": 610 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.476, - "step": 611 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0937, - "step": 612 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3027, - "step": 613 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5586, - "step": 614 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3796, - "step": 615 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.676, - "step": 616 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5321, - "step": 617 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0059, - "step": 618 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6139, - "step": 619 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.2391, - "step": 620 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0636, - "step": 621 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0895, - "step": 622 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.62, - "step": 623 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0469, - "step": 624 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2173, - "step": 625 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9432, - "step": 626 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3928, - "step": 627 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0959, - "step": 628 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.1197, - "step": 629 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4277, - "step": 630 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.418, - "step": 631 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8687, - "step": 632 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0156, - "step": 633 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.573, - "step": 634 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.112, - "step": 635 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8954, - "step": 636 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.36, - "step": 637 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.924, - "step": 638 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4625, - "step": 639 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2023, - "step": 640 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0685, - "step": 641 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5304, - "step": 642 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4456, - "step": 643 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7271, - "step": 644 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6011, - "step": 645 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.895, - "step": 646 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.864, - "step": 647 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3452, - "step": 648 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8978, - "step": 649 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2253, - "step": 650 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2813, - "step": 651 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7248, - "step": 652 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4283, - "step": 653 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4304, - "step": 654 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3893, - "step": 655 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1115, - "step": 656 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5892, - "step": 657 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6572, - "step": 658 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.925, - "step": 659 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4431, - "step": 660 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7711, - "step": 661 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9439, - "step": 662 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3781, - "step": 663 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5573, - "step": 664 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.4476, - "step": 665 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0057, - "step": 666 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2702, - "step": 667 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5717, - "step": 668 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2242, - "step": 669 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1, - "step": 670 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0517, - "step": 671 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6543, - "step": 672 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1138, - "step": 673 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.461, - "step": 674 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7094, - "step": 675 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.521, - "step": 676 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7116, - "step": 677 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6343, - "step": 678 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3762, - "step": 679 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3603, - "step": 680 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7144, - "step": 681 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4545, - "step": 682 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8188, - "step": 683 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7965, - "step": 684 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4675, - "step": 685 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0436, - "step": 686 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1219, - "step": 687 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4517, - "step": 688 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8476, - "step": 689 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9284, - "step": 690 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7405, - "step": 691 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7142, - "step": 692 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3979, - "step": 693 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.3285, - "step": 694 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3418, - "step": 695 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4472, - "step": 696 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7355, - "step": 697 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7982, - "step": 698 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4516, - "step": 699 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2532, - "step": 700 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9959, - "step": 701 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0418, - "step": 702 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.7767, - "step": 703 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.774, - "step": 704 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8912, - "step": 705 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2178, - "step": 706 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6197, - "step": 707 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4755, - "step": 708 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8276, - "step": 709 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2925, - "step": 710 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3887, - "step": 711 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1465, - "step": 712 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5806, - "step": 713 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3063, - "step": 714 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6066, - "step": 715 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1536, - "step": 716 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5582, - "step": 717 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0353, - "step": 718 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6415, - "step": 719 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8291, - "step": 720 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.7575, - "step": 721 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9141, - "step": 722 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5217, - "step": 723 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4549, - "step": 724 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8112, - "step": 725 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2729, - "step": 726 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8515, - "step": 727 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9712, - "step": 728 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.097, - "step": 729 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0208, - "step": 730 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1377, - "step": 731 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4019, - "step": 732 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9869, - "step": 733 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2954, - "step": 734 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4144, - "step": 735 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8053, - "step": 736 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8891, - "step": 737 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.812, - "step": 738 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2657, - "step": 739 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3747, - "step": 740 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0364, - "step": 741 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8845, - "step": 742 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.887, - "step": 743 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0706, - "step": 744 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6619, - "step": 745 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2941, - "step": 746 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9192, - "step": 747 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9947, - "step": 748 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6376, - "step": 749 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0358, - "step": 750 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4578, - "step": 751 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7784, - "step": 752 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.632, - "step": 753 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8649, - "step": 754 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7951, - "step": 755 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3841, - "step": 756 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4558, - "step": 757 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7638, - "step": 758 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9413, - "step": 759 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0916, - "step": 760 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1351, - "step": 761 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6078, - "step": 762 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7982, - "step": 763 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6132, - "step": 764 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.551, - "step": 765 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3301, - "step": 766 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4888, - "step": 767 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1476, - "step": 768 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4244, - "step": 769 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6025, - "step": 770 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.102, - "step": 771 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.017, - "step": 772 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4101, - "step": 773 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1741, - "step": 774 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1256, - "step": 775 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5164, - "step": 776 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6959, - "step": 777 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7666, - "step": 778 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4336, - "step": 779 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 11.8478, - "step": 780 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8382, - "step": 781 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1792, - "step": 782 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4424, - "step": 783 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.345, - "step": 784 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6887, - "step": 785 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9867, - "step": 786 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6152, - "step": 787 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7283, - "step": 788 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0157, - "step": 789 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6044, - "step": 790 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4132, - "step": 791 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.735, - "step": 792 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3631, - "step": 793 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2308, - "step": 794 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2184, - "step": 795 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4661, - "step": 796 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9637, - "step": 797 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4178, - "step": 798 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5909, - "step": 799 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1482, - "step": 800 - }, - { - "epoch": 0.01, - "eval_loss": 7.355834484100342, - "eval_runtime": 22.6252, - "eval_samples_per_second": 2.21, - "eval_steps_per_second": 1.105, - "step": 800 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 5.191131496429444, - "step": 800 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.0427, - "step": 801 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2669, - "step": 802 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8026, - "step": 803 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4949, - "step": 804 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4491, - "step": 805 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0383, - "step": 806 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1213, - "step": 807 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5158, - "step": 808 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5648, - "step": 809 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9932, - "step": 810 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6441, - "step": 811 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8661, - "step": 812 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3609, - "step": 813 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6828, - "step": 814 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9693, - "step": 815 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3733, - "step": 816 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6286, - "step": 817 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4349, - "step": 818 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6706, - "step": 819 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3089, - "step": 820 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2394, - "step": 821 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.963, - "step": 822 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6564, - "step": 823 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.997, - "step": 824 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9261, - "step": 825 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1421, - "step": 826 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2335, - "step": 827 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3432, - "step": 828 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0154, - "step": 829 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5135, - "step": 830 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6226, - "step": 831 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1672, - "step": 832 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0853, - "step": 833 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1213, - "step": 834 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7815, - "step": 835 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8916, - "step": 836 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6464, - "step": 837 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3307, - "step": 838 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8165, - "step": 839 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.886, - "step": 840 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4781, - "step": 841 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8351, - "step": 842 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.358, - "step": 843 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6501, - "step": 844 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0864, - "step": 845 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2922, - "step": 846 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.9847, - "step": 847 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2558, - "step": 848 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0195, - "step": 849 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.996, - "step": 850 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5705, - "step": 851 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4136, - "step": 852 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6302, - "step": 853 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8761, - "step": 854 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4995, - "step": 855 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4762, - "step": 856 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5749, - "step": 857 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0273, - "step": 858 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8258, - "step": 859 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1836, - "step": 860 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5493, - "step": 861 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1891, - "step": 862 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7392, - "step": 863 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1655, - "step": 864 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5218, - "step": 865 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3759, - "step": 866 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2497, - "step": 867 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5901, - "step": 868 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0624, - "step": 869 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.2452, - "step": 870 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5649, - "step": 871 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0826, - "step": 872 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2703, - "step": 873 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9088, - "step": 874 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3875, - "step": 875 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2511, - "step": 876 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4065, - "step": 877 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.175, - "step": 878 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8358, - "step": 879 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3208, - "step": 880 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2049, - "step": 881 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8251, - "step": 882 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4262, - "step": 883 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2227, - "step": 884 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1062, - "step": 885 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9417, - "step": 886 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3315, - "step": 887 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0012, - "step": 888 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6386, - "step": 889 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0377, - "step": 890 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6707, - "step": 891 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4955, - "step": 892 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7343, - "step": 893 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8305, - "step": 894 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7016, - "step": 895 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7149, - "step": 896 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5649, - "step": 897 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.815, - "step": 898 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6135, - "step": 899 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8776, - "step": 900 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7288, - "step": 901 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8019, - "step": 902 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0921, - "step": 903 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.824, - "step": 904 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7151, - "step": 905 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5054, - "step": 906 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8095, - "step": 907 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3218, - "step": 908 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9993, - "step": 909 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4433, - "step": 910 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5863, - "step": 911 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.505, - "step": 912 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9734, - "step": 913 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1792, - "step": 914 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4574, - "step": 915 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2787, - "step": 916 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8201, - "step": 917 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2537, - "step": 918 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1387, - "step": 919 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7161, - "step": 920 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2207, - "step": 921 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7953, - "step": 922 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9949, - "step": 923 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9173, - "step": 924 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7903, - "step": 925 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4784, - "step": 926 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2264, - "step": 927 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.566, - "step": 928 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0686, - "step": 929 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.791, - "step": 930 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8393, - "step": 931 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4387, - "step": 932 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2374, - "step": 933 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9598, - "step": 934 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1597, - "step": 935 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0403, - "step": 936 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3301, - "step": 937 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.072, - "step": 938 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4788, - "step": 939 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0656, - "step": 940 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9647, - "step": 941 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1168, - "step": 942 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0293, - "step": 943 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3622, - "step": 944 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8957, - "step": 945 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4, - "step": 946 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6626, - "step": 947 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8212, - "step": 948 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8638, - "step": 949 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6406, - "step": 950 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7069, - "step": 951 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1384, - "step": 952 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.612, - "step": 953 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7201, - "step": 954 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3532, - "step": 955 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1266, - "step": 956 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6192, - "step": 957 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.826, - "step": 958 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9338, - "step": 959 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4487, - "step": 960 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.872, - "step": 961 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8601, - "step": 962 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7401, - "step": 963 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5412, - "step": 964 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2501, - "step": 965 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6837, - "step": 966 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6494, - "step": 967 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.604, - "step": 968 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.837, - "step": 969 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3957, - "step": 970 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3281, - "step": 971 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8264, - "step": 972 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6554, - "step": 973 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5768, - "step": 974 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4187, - "step": 975 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8479, - "step": 976 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9849, - "step": 977 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6471, - "step": 978 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8041, - "step": 979 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8876, - "step": 980 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6423, - "step": 981 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5329, - "step": 982 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2801, - "step": 983 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1699, - "step": 984 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6469, - "step": 985 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6766, - "step": 986 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7538, - "step": 987 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9606, - "step": 988 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0713, - "step": 989 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4965, - "step": 990 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3408, - "step": 991 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4007, - "step": 992 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8921, - "step": 993 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8681, - "step": 994 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.8867, - "step": 995 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.467, - "step": 996 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7895, - "step": 997 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0523, - "step": 998 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4032, - "step": 999 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7719, - "step": 1000 - }, - { - "epoch": 0.01, - "eval_loss": 6.766034126281738, - "eval_runtime": 22.4042, - "eval_samples_per_second": 2.232, - "eval_steps_per_second": 1.116, - "step": 1000 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 4.338861379623413, - "step": 1000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0285, - "step": 1001 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4571, - "step": 1002 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7721, - "step": 1003 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5108, - "step": 1004 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3813, - "step": 1005 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7963, - "step": 1006 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1101, - "step": 1007 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.021, - "step": 1008 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5916, - "step": 1009 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8813, - "step": 1010 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1193, - "step": 1011 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5092, - "step": 1012 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8569, - "step": 1013 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.119, - "step": 1014 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3247, - "step": 1015 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2358, - "step": 1016 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2795, - "step": 1017 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3466, - "step": 1018 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5443, - "step": 1019 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7296, - "step": 1020 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0412, - "step": 1021 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4829, - "step": 1022 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7901, - "step": 1023 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8077, - "step": 1024 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4887, - "step": 1025 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3095, - "step": 1026 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3235, - "step": 1027 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6315, - "step": 1028 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4294, - "step": 1029 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8457, - "step": 1030 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7583, - "step": 1031 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3129, - "step": 1032 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1832, - "step": 1033 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1764, - "step": 1034 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0101, - "step": 1035 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6524, - "step": 1036 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2825, - "step": 1037 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2262, - "step": 1038 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2533, - "step": 1039 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8794, - "step": 1040 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7901, - "step": 1041 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8351, - "step": 1042 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5888, - "step": 1043 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8932, - "step": 1044 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2999, - "step": 1045 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8396, - "step": 1046 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4209, - "step": 1047 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1524, - "step": 1048 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7784, - "step": 1049 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0179, - "step": 1050 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1153, - "step": 1051 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2149, - "step": 1052 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0117, - "step": 1053 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9693, - "step": 1054 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5656, - "step": 1055 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5, - "step": 1056 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.102, - "step": 1057 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3079, - "step": 1058 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5754, - "step": 1059 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6989, - "step": 1060 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9597, - "step": 1061 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3743, - "step": 1062 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8887, - "step": 1063 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3779, - "step": 1064 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5001, - "step": 1065 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4095, - "step": 1066 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5073, - "step": 1067 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1331, - "step": 1068 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.323, - "step": 1069 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6116, - "step": 1070 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1212, - "step": 1071 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0951, - "step": 1072 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2463, - "step": 1073 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4488, - "step": 1074 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.279, - "step": 1075 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5728, - "step": 1076 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1362, - "step": 1077 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6648, - "step": 1078 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.427, - "step": 1079 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8145, - "step": 1080 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5308, - "step": 1081 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.974, - "step": 1082 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1965, - "step": 1083 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8749, - "step": 1084 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7352, - "step": 1085 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7934, - "step": 1086 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6003, - "step": 1087 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5775, - "step": 1088 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.519, - "step": 1089 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7403, - "step": 1090 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8145, - "step": 1091 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5776, - "step": 1092 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3753, - "step": 1093 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9586, - "step": 1094 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7263, - "step": 1095 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7034, - "step": 1096 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0579, - "step": 1097 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8419, - "step": 1098 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0751, - "step": 1099 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6438, - "step": 1100 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8744, - "step": 1101 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4992, - "step": 1102 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8094, - "step": 1103 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.162, - "step": 1104 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8351, - "step": 1105 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8845, - "step": 1106 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1894, - "step": 1107 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.8333, - "step": 1108 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4226, - "step": 1109 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0712, - "step": 1110 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9981, - "step": 1111 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5885, - "step": 1112 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1915, - "step": 1113 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8003, - "step": 1114 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5566, - "step": 1115 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4085, - "step": 1116 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0793, - "step": 1117 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0909, - "step": 1118 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2273, - "step": 1119 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8273, - "step": 1120 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0231, - "step": 1121 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7229, - "step": 1122 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4479, - "step": 1123 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2178, - "step": 1124 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9038, - "step": 1125 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2653, - "step": 1126 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2974, - "step": 1127 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3003, - "step": 1128 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7853, - "step": 1129 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9143, - "step": 1130 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2573, - "step": 1131 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7091, - "step": 1132 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3372, - "step": 1133 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4165, - "step": 1134 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4422, - "step": 1135 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7693, - "step": 1136 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7802, - "step": 1137 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7263, - "step": 1138 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6749, - "step": 1139 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9459, - "step": 1140 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9697, - "step": 1141 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4506, - "step": 1142 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5099, - "step": 1143 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1475, - "step": 1144 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3769, - "step": 1145 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2035, - "step": 1146 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6017, - "step": 1147 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.463, - "step": 1148 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3844, - "step": 1149 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5306, - "step": 1150 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5502, - "step": 1151 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7394, - "step": 1152 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5626, - "step": 1153 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1618, - "step": 1154 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5174, - "step": 1155 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1038, - "step": 1156 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3789, - "step": 1157 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2985, - "step": 1158 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4763, - "step": 1159 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5071, - "step": 1160 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0827, - "step": 1161 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7349, - "step": 1162 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.798, - "step": 1163 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3176, - "step": 1164 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8114, - "step": 1165 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3379, - "step": 1166 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1157, - "step": 1167 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4675, - "step": 1168 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2721, - "step": 1169 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0603, - "step": 1170 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6358, - "step": 1171 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0865, - "step": 1172 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.709, - "step": 1173 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7705, - "step": 1174 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7677, - "step": 1175 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2418, - "step": 1176 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7114, - "step": 1177 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1165, - "step": 1178 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9654, - "step": 1179 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0672, - "step": 1180 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1738, - "step": 1181 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7604, - "step": 1182 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8426, - "step": 1183 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0231, - "step": 1184 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2938, - "step": 1185 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.783, - "step": 1186 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3328, - "step": 1187 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.321, - "step": 1188 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6368, - "step": 1189 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.101, - "step": 1190 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6777, - "step": 1191 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0831, - "step": 1192 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5853, - "step": 1193 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7923, - "step": 1194 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3734, - "step": 1195 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4268, - "step": 1196 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6796, - "step": 1197 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9028, - "step": 1198 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3716, - "step": 1199 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6761, - "step": 1200 - }, - { - "epoch": 0.01, - "eval_loss": 6.9188361167907715, - "eval_runtime": 22.426, - "eval_samples_per_second": 2.23, - "eval_steps_per_second": 1.115, - "step": 1200 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 3.3686839294433595, - "step": 1200 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8855, - "step": 1201 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8206, - "step": 1202 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4401, - "step": 1203 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2366, - "step": 1204 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9885, - "step": 1205 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5444, - "step": 1206 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4259, - "step": 1207 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5369, - "step": 1208 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0839, - "step": 1209 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7622, - "step": 1210 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8979, - "step": 1211 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5508, - "step": 1212 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6439, - "step": 1213 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6249, - "step": 1214 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.495, - "step": 1215 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0642, - "step": 1216 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8997, - "step": 1217 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6275, - "step": 1218 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3317, - "step": 1219 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4635, - "step": 1220 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5197, - "step": 1221 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5928, - "step": 1222 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2363, - "step": 1223 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0266, - "step": 1224 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3356, - "step": 1225 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7927, - "step": 1226 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6952, - "step": 1227 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8878, - "step": 1228 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7472, - "step": 1229 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6454, - "step": 1230 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4972, - "step": 1231 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3347, - "step": 1232 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1631, - "step": 1233 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4708, - "step": 1234 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5697, - "step": 1235 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8218, - "step": 1236 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.269, - "step": 1237 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4165, - "step": 1238 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3653, - "step": 1239 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0152, - "step": 1240 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9157, - "step": 1241 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4086, - "step": 1242 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2493, - "step": 1243 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8279, - "step": 1244 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6649, - "step": 1245 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4405, - "step": 1246 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1992, - "step": 1247 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2055, - "step": 1248 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4395, - "step": 1249 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2475, - "step": 1250 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8205, - "step": 1251 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1053, - "step": 1252 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7494, - "step": 1253 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7387, - "step": 1254 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8983, - "step": 1255 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5614, - "step": 1256 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7617, - "step": 1257 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2445, - "step": 1258 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3043, - "step": 1259 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4214, - "step": 1260 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1384, - "step": 1261 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3914, - "step": 1262 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3287, - "step": 1263 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2174, - "step": 1264 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4397, - "step": 1265 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6875, - "step": 1266 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4512, - "step": 1267 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2834, - "step": 1268 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7651, - "step": 1269 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9263, - "step": 1270 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6721, - "step": 1271 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9178, - "step": 1272 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7967, - "step": 1273 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5242, - "step": 1274 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7794, - "step": 1275 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4256, - "step": 1276 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5788, - "step": 1277 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7586, - "step": 1278 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.964, - "step": 1279 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0749, - "step": 1280 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6248, - "step": 1281 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2465, - "step": 1282 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1591, - "step": 1283 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4328, - "step": 1284 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.534, - "step": 1285 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.523, - "step": 1286 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5672, - "step": 1287 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9162, - "step": 1288 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1089, - "step": 1289 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3287, - "step": 1290 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2499, - "step": 1291 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9645, - "step": 1292 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3903, - "step": 1293 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5322, - "step": 1294 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2211, - "step": 1295 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2788, - "step": 1296 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1862, - "step": 1297 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2678, - "step": 1298 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5399, - "step": 1299 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7935, - "step": 1300 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0391, - "step": 1301 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1049, - "step": 1302 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.365, - "step": 1303 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8809, - "step": 1304 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2335, - "step": 1305 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.5135, - "step": 1306 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2378, - "step": 1307 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9265, - "step": 1308 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.641, - "step": 1309 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9822, - "step": 1310 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3369, - "step": 1311 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3735, - "step": 1312 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2618, - "step": 1313 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6854, - "step": 1314 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3748, - "step": 1315 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9206, - "step": 1316 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1969, - "step": 1317 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1245, - "step": 1318 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9977, - "step": 1319 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5319, - "step": 1320 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4431, - "step": 1321 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7264, - "step": 1322 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.05, - "step": 1323 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3118, - "step": 1324 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4575, - "step": 1325 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.593, - "step": 1326 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0061, - "step": 1327 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2481, - "step": 1328 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8017, - "step": 1329 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8617, - "step": 1330 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7036, - "step": 1331 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0091, - "step": 1332 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9687, - "step": 1333 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3925, - "step": 1334 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1127, - "step": 1335 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8163, - "step": 1336 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0639, - "step": 1337 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8929, - "step": 1338 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5011, - "step": 1339 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.033, - "step": 1340 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0526, - "step": 1341 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4428, - "step": 1342 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3477, - "step": 1343 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.881, - "step": 1344 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5276, - "step": 1345 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4183, - "step": 1346 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4943, - "step": 1347 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9187, - "step": 1348 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1003, - "step": 1349 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1187, - "step": 1350 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8081, - "step": 1351 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4695, - "step": 1352 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5761, - "step": 1353 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9635, - "step": 1354 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2133, - "step": 1355 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2611, - "step": 1356 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6885, - "step": 1357 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1157, - "step": 1358 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4421, - "step": 1359 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2128, - "step": 1360 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6978, - "step": 1361 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9804, - "step": 1362 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3426, - "step": 1363 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2676, - "step": 1364 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.325, - "step": 1365 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1263, - "step": 1366 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7481, - "step": 1367 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6891, - "step": 1368 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8568, - "step": 1369 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9893, - "step": 1370 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0302, - "step": 1371 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3204, - "step": 1372 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9008, - "step": 1373 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2624, - "step": 1374 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6234, - "step": 1375 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2286, - "step": 1376 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3426, - "step": 1377 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1962, - "step": 1378 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3142, - "step": 1379 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.414, - "step": 1380 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0191, - "step": 1381 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4953, - "step": 1382 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6694, - "step": 1383 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8611, - "step": 1384 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.86, - "step": 1385 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6519, - "step": 1386 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.394, - "step": 1387 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2117, - "step": 1388 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9924, - "step": 1389 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.58, - "step": 1390 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4415, - "step": 1391 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7196, - "step": 1392 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7388, - "step": 1393 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4784, - "step": 1394 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.496, - "step": 1395 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8706, - "step": 1396 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1858, - "step": 1397 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9038, - "step": 1398 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4852, - "step": 1399 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2717, - "step": 1400 - }, - { - "epoch": 0.01, - "eval_loss": 6.97923469543457, - "eval_runtime": 22.472, - "eval_samples_per_second": 2.225, - "eval_steps_per_second": 1.112, - "step": 1400 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.657382688522339, - "step": 1400 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.843, - "step": 1401 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5611, - "step": 1402 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2434, - "step": 1403 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3136, - "step": 1404 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.686, - "step": 1405 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6365, - "step": 1406 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1811, - "step": 1407 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7537, - "step": 1408 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2949, - "step": 1409 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4827, - "step": 1410 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0965, - "step": 1411 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.007, - "step": 1412 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2861, - "step": 1413 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1774, - "step": 1414 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7777, - "step": 1415 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0259, - "step": 1416 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9024, - "step": 1417 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4786, - "step": 1418 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5873, - "step": 1419 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2744, - "step": 1420 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9484, - "step": 1421 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2093, - "step": 1422 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3394, - "step": 1423 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1247, - "step": 1424 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0691, - "step": 1425 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.559, - "step": 1426 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1518, - "step": 1427 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4143, - "step": 1428 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0287, - "step": 1429 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8112, - "step": 1430 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2625, - "step": 1431 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3528, - "step": 1432 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2715, - "step": 1433 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7849, - "step": 1434 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2002, - "step": 1435 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0658, - "step": 1436 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0671, - "step": 1437 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2577, - "step": 1438 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.803, - "step": 1439 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2974, - "step": 1440 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0897, - "step": 1441 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0805, - "step": 1442 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7681, - "step": 1443 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6565, - "step": 1444 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0174, - "step": 1445 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8507, - "step": 1446 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2105, - "step": 1447 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.572, - "step": 1448 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2904, - "step": 1449 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4623, - "step": 1450 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4774, - "step": 1451 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1277, - "step": 1452 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6204, - "step": 1453 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3219, - "step": 1454 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2517, - "step": 1455 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3026, - "step": 1456 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4016, - "step": 1457 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5256, - "step": 1458 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9316, - "step": 1459 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.631, - "step": 1460 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2888, - "step": 1461 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5511, - "step": 1462 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.9799, - "step": 1463 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6982, - "step": 1464 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4923, - "step": 1465 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8329, - "step": 1466 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2733, - "step": 1467 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8221, - "step": 1468 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.363, - "step": 1469 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6348, - "step": 1470 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3319, - "step": 1471 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6768, - "step": 1472 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1985, - "step": 1473 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6109, - "step": 1474 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.974, - "step": 1475 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8902, - "step": 1476 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6762, - "step": 1477 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8541, - "step": 1478 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3867, - "step": 1479 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9624, - "step": 1480 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8768, - "step": 1481 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7598, - "step": 1482 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6522, - "step": 1483 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8156, - "step": 1484 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3791, - "step": 1485 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2178, - "step": 1486 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8448, - "step": 1487 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5377, - "step": 1488 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7407, - "step": 1489 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7636, - "step": 1490 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4325, - "step": 1491 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8966, - "step": 1492 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0626, - "step": 1493 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.255, - "step": 1494 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2802, - "step": 1495 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.894, - "step": 1496 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6482, - "step": 1497 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8903, - "step": 1498 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8672, - "step": 1499 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6079, - "step": 1500 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6217, - "step": 1501 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2361, - "step": 1502 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3962, - "step": 1503 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0036, - "step": 1504 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5926, - "step": 1505 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.114, - "step": 1506 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4419, - "step": 1507 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7838, - "step": 1508 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6635, - "step": 1509 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2906, - "step": 1510 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4186, - "step": 1511 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4783, - "step": 1512 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1226, - "step": 1513 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2458, - "step": 1514 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5302, - "step": 1515 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1515, - "step": 1516 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4182, - "step": 1517 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8248, - "step": 1518 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2349, - "step": 1519 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9314, - "step": 1520 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1161, - "step": 1521 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4183, - "step": 1522 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4454, - "step": 1523 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5588, - "step": 1524 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8026, - "step": 1525 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7695, - "step": 1526 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3636, - "step": 1527 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2776, - "step": 1528 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5386, - "step": 1529 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.521, - "step": 1530 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8388, - "step": 1531 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3561, - "step": 1532 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9606, - "step": 1533 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9438, - "step": 1534 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7665, - "step": 1535 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5826, - "step": 1536 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.0798, - "step": 1537 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8545, - "step": 1538 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.302, - "step": 1539 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1092, - "step": 1540 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5021, - "step": 1541 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9384, - "step": 1542 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8761, - "step": 1543 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3316, - "step": 1544 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2051, - "step": 1545 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7907, - "step": 1546 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2534, - "step": 1547 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2274, - "step": 1548 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9226, - "step": 1549 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2502, - "step": 1550 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2703, - "step": 1551 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4359, - "step": 1552 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.128, - "step": 1553 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3147, - "step": 1554 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.026, - "step": 1555 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9393, - "step": 1556 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7753, - "step": 1557 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9049, - "step": 1558 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0538, - "step": 1559 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8691, - "step": 1560 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9377, - "step": 1561 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8385, - "step": 1562 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.939, - "step": 1563 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.727, - "step": 1564 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7866, - "step": 1565 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2439, - "step": 1566 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9607, - "step": 1567 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3505, - "step": 1568 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7729, - "step": 1569 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4669, - "step": 1570 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8178, - "step": 1571 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2173, - "step": 1572 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2136, - "step": 1573 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2888, - "step": 1574 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0386, - "step": 1575 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9041, - "step": 1576 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7544, - "step": 1577 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.3229, - "step": 1578 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4203, - "step": 1579 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.497, - "step": 1580 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8253, - "step": 1581 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0801, - "step": 1582 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1585, - "step": 1583 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6965, - "step": 1584 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.498, - "step": 1585 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8697, - "step": 1586 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2663, - "step": 1587 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7004, - "step": 1588 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6561, - "step": 1589 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.785, - "step": 1590 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5708, - "step": 1591 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.326, - "step": 1592 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2974, - "step": 1593 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1408, - "step": 1594 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6526, - "step": 1595 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4116, - "step": 1596 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0484, - "step": 1597 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3162, - "step": 1598 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3806, - "step": 1599 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0251, - "step": 1600 - }, - { - "epoch": 0.01, - "eval_loss": 6.617897987365723, - "eval_runtime": 22.4646, - "eval_samples_per_second": 2.226, - "eval_steps_per_second": 1.113, - "step": 1600 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.160770101547241, - "step": 1600 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9137, - "step": 1601 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2188, - "step": 1602 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7688, - "step": 1603 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9799, - "step": 1604 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5429, - "step": 1605 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8559, - "step": 1606 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3985, - "step": 1607 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9139, - "step": 1608 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3303, - "step": 1609 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5168, - "step": 1610 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5194, - "step": 1611 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9557, - "step": 1612 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7102, - "step": 1613 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8961, - "step": 1614 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6123, - "step": 1615 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7808, - "step": 1616 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4919, - "step": 1617 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0723, - "step": 1618 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2931, - "step": 1619 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8478, - "step": 1620 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7126, - "step": 1621 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6622, - "step": 1622 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3861, - "step": 1623 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9856, - "step": 1624 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5076, - "step": 1625 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4168, - "step": 1626 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2825, - "step": 1627 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7497, - "step": 1628 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5672, - "step": 1629 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4095, - "step": 1630 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.649, - "step": 1631 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3418, - "step": 1632 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1337, - "step": 1633 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3829, - "step": 1634 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0294, - "step": 1635 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2164, - "step": 1636 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3294, - "step": 1637 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7047, - "step": 1638 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5622, - "step": 1639 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4873, - "step": 1640 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6641, - "step": 1641 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3943, - "step": 1642 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2266, - "step": 1643 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0471, - "step": 1644 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5658, - "step": 1645 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6489, - "step": 1646 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3851, - "step": 1647 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7921, - "step": 1648 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4581, - "step": 1649 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1407, - "step": 1650 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2919, - "step": 1651 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4061, - "step": 1652 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3081, - "step": 1653 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0527, - "step": 1654 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8729, - "step": 1655 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.029, - "step": 1656 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6632, - "step": 1657 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7047, - "step": 1658 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6288, - "step": 1659 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8466, - "step": 1660 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7726, - "step": 1661 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.245, - "step": 1662 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0538, - "step": 1663 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3611, - "step": 1664 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.011, - "step": 1665 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6491, - "step": 1666 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3409, - "step": 1667 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.262, - "step": 1668 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.781, - "step": 1669 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8025, - "step": 1670 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7529, - "step": 1671 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2322, - "step": 1672 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4527, - "step": 1673 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9457, - "step": 1674 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.859, - "step": 1675 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9207, - "step": 1676 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5378, - "step": 1677 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6585, - "step": 1678 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9523, - "step": 1679 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1348, - "step": 1680 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9582, - "step": 1681 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.416, - "step": 1682 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8214, - "step": 1683 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8833, - "step": 1684 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1021, - "step": 1685 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7392, - "step": 1686 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2616, - "step": 1687 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.325, - "step": 1688 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3814, - "step": 1689 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2816, - "step": 1690 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.033, - "step": 1691 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5742, - "step": 1692 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0841, - "step": 1693 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2888, - "step": 1694 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9263, - "step": 1695 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7552, - "step": 1696 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4163, - "step": 1697 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6207, - "step": 1698 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.938, - "step": 1699 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2925, - "step": 1700 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0401, - "step": 1701 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1536, - "step": 1702 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2754, - "step": 1703 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6765, - "step": 1704 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.63, - "step": 1705 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6902, - "step": 1706 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6434, - "step": 1707 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2283, - "step": 1708 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9377, - "step": 1709 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.371, - "step": 1710 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6569, - "step": 1711 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2221, - "step": 1712 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5375, - "step": 1713 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2189, - "step": 1714 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.769, - "step": 1715 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0089, - "step": 1716 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6402, - "step": 1717 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4812, - "step": 1718 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9754, - "step": 1719 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8435, - "step": 1720 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9424, - "step": 1721 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5465, - "step": 1722 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.477, - "step": 1723 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2254, - "step": 1724 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3663, - "step": 1725 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.663, - "step": 1726 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6951, - "step": 1727 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.856, - "step": 1728 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0652, - "step": 1729 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6929, - "step": 1730 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8069, - "step": 1731 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.02, - "step": 1732 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0782, - "step": 1733 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0236, - "step": 1734 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2769, - "step": 1735 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7126, - "step": 1736 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2746, - "step": 1737 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8726, - "step": 1738 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7962, - "step": 1739 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7602, - "step": 1740 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.3105, - "step": 1741 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0771, - "step": 1742 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4738, - "step": 1743 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2952, - "step": 1744 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2692, - "step": 1745 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7201, - "step": 1746 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2978, - "step": 1747 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.518, - "step": 1748 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.659, - "step": 1749 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9101, - "step": 1750 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8397, - "step": 1751 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0451, - "step": 1752 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7696, - "step": 1753 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1377, - "step": 1754 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2621, - "step": 1755 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2143, - "step": 1756 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4378, - "step": 1757 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8631, - "step": 1758 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.019, - "step": 1759 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7475, - "step": 1760 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6228, - "step": 1761 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0703, - "step": 1762 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3426, - "step": 1763 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0842, - "step": 1764 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1032, - "step": 1765 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6321, - "step": 1766 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7804, - "step": 1767 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6566, - "step": 1768 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4985, - "step": 1769 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1129, - "step": 1770 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8081, - "step": 1771 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8441, - "step": 1772 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4054, - "step": 1773 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6334, - "step": 1774 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4323, - "step": 1775 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.18, - "step": 1776 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7573, - "step": 1777 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4642, - "step": 1778 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.038, - "step": 1779 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3785, - "step": 1780 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5088, - "step": 1781 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0139, - "step": 1782 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0999, - "step": 1783 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3224, - "step": 1784 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.634, - "step": 1785 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1264, - "step": 1786 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.317, - "step": 1787 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1279, - "step": 1788 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2364, - "step": 1789 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0627, - "step": 1790 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2471, - "step": 1791 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8407, - "step": 1792 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7083, - "step": 1793 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4522, - "step": 1794 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0308, - "step": 1795 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6915, - "step": 1796 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.082, - "step": 1797 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7956, - "step": 1798 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7007, - "step": 1799 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9197, - "step": 1800 - }, - { - "epoch": 0.01, - "eval_loss": 6.619495868682861, - "eval_runtime": 22.4352, - "eval_samples_per_second": 2.229, - "eval_steps_per_second": 1.114, - "step": 1800 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.238778591156006, - "step": 1800 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1537, - "step": 1801 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.684, - "step": 1802 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7862, - "step": 1803 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3518, - "step": 1804 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1795, - "step": 1805 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0054, - "step": 1806 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8997, - "step": 1807 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9002, - "step": 1808 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2805, - "step": 1809 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1203, - "step": 1810 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0206, - "step": 1811 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0151, - "step": 1812 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3864, - "step": 1813 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1117, - "step": 1814 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8487, - "step": 1815 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.59, - "step": 1816 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1615, - "step": 1817 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7362, - "step": 1818 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2294, - "step": 1819 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5622, - "step": 1820 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5437, - "step": 1821 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.093, - "step": 1822 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0343, - "step": 1823 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4454, - "step": 1824 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5138, - "step": 1825 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5605, - "step": 1826 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.322, - "step": 1827 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6489, - "step": 1828 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.331, - "step": 1829 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6462, - "step": 1830 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.932, - "step": 1831 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9058, - "step": 1832 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3433, - "step": 1833 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4365, - "step": 1834 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3282, - "step": 1835 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.448, - "step": 1836 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5369, - "step": 1837 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.177, - "step": 1838 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3552, - "step": 1839 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4568, - "step": 1840 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0602, - "step": 1841 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7449, - "step": 1842 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2675, - "step": 1843 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0317, - "step": 1844 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4342, - "step": 1845 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8688, - "step": 1846 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3571, - "step": 1847 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3776, - "step": 1848 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2248, - "step": 1849 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6073, - "step": 1850 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8425, - "step": 1851 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5954, - "step": 1852 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4197, - "step": 1853 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8624, - "step": 1854 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9652, - "step": 1855 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7145, - "step": 1856 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5309, - "step": 1857 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4356, - "step": 1858 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6508, - "step": 1859 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0955, - "step": 1860 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6886, - "step": 1861 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7644, - "step": 1862 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5709, - "step": 1863 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6212, - "step": 1864 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6325, - "step": 1865 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6805, - "step": 1866 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1464, - "step": 1867 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9244, - "step": 1868 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.336, - "step": 1869 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8783, - "step": 1870 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8236, - "step": 1871 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.084, - "step": 1872 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9639, - "step": 1873 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4173, - "step": 1874 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0042, - "step": 1875 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2519, - "step": 1876 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4656, - "step": 1877 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5136, - "step": 1878 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3918, - "step": 1879 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9696, - "step": 1880 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9736, - "step": 1881 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6192, - "step": 1882 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3476, - "step": 1883 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3048, - "step": 1884 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1116, - "step": 1885 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.971, - "step": 1886 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0741, - "step": 1887 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1418, - "step": 1888 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3487, - "step": 1889 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.38, - "step": 1890 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6561, - "step": 1891 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5606, - "step": 1892 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8623, - "step": 1893 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2984, - "step": 1894 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6179, - "step": 1895 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8625, - "step": 1896 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8596, - "step": 1897 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7205, - "step": 1898 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6727, - "step": 1899 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.016, - "step": 1900 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9868, - "step": 1901 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.632, - "step": 1902 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5133, - "step": 1903 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7476, - "step": 1904 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4174, - "step": 1905 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6789, - "step": 1906 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4534, - "step": 1907 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3335, - "step": 1908 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7921, - "step": 1909 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9567, - "step": 1910 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.1739, - "step": 1911 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7514, - "step": 1912 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3858, - "step": 1913 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0462, - "step": 1914 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3817, - "step": 1915 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9739, - "step": 1916 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1122, - "step": 1917 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3361, - "step": 1918 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3184, - "step": 1919 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7342, - "step": 1920 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.375, - "step": 1921 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6841, - "step": 1922 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0773, - "step": 1923 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8916, - "step": 1924 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7176, - "step": 1925 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8841, - "step": 1926 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8345, - "step": 1927 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.561, - "step": 1928 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5392, - "step": 1929 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1627, - "step": 1930 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0657, - "step": 1931 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7385, - "step": 1932 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5533, - "step": 1933 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0925, - "step": 1934 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8752, - "step": 1935 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4039, - "step": 1936 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6472, - "step": 1937 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1819, - "step": 1938 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5919, - "step": 1939 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6527, - "step": 1940 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5188, - "step": 1941 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9856, - "step": 1942 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7038, - "step": 1943 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.911, - "step": 1944 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.497, - "step": 1945 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1804, - "step": 1946 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3949, - "step": 1947 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0433, - "step": 1948 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4706, - "step": 1949 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5896, - "step": 1950 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.557, - "step": 1951 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.34, - "step": 1952 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7865, - "step": 1953 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0797, - "step": 1954 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2896, - "step": 1955 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4096, - "step": 1956 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9538, - "step": 1957 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2778, - "step": 1958 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4968, - "step": 1959 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8328, - "step": 1960 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4597, - "step": 1961 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6776, - "step": 1962 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4861, - "step": 1963 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5831, - "step": 1964 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4585, - "step": 1965 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7898, - "step": 1966 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8714, - "step": 1967 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.752, - "step": 1968 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9024, - "step": 1969 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.058, - "step": 1970 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1745, - "step": 1971 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2162, - "step": 1972 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2668, - "step": 1973 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3307, - "step": 1974 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3285, - "step": 1975 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1353, - "step": 1976 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8069, - "step": 1977 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6885, - "step": 1978 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5946, - "step": 1979 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6828, - "step": 1980 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6516, - "step": 1981 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.261, - "step": 1982 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.524, - "step": 1983 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.585, - "step": 1984 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8883, - "step": 1985 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.689, - "step": 1986 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1083, - "step": 1987 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1606, - "step": 1988 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9243, - "step": 1989 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6597, - "step": 1990 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2849, - "step": 1991 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3715, - "step": 1992 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7262, - "step": 1993 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6862, - "step": 1994 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5412, - "step": 1995 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7483, - "step": 1996 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3391, - "step": 1997 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2642, - "step": 1998 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1519, - "step": 1999 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7098, - "step": 2000 - }, - { - "epoch": 0.02, - "eval_loss": 6.762476921081543, - "eval_runtime": 22.4899, - "eval_samples_per_second": 2.223, - "eval_steps_per_second": 1.112, - "step": 2000 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.4606559085845947, - "step": 2000 - } - ], - "max_steps": 30000, - "num_train_epochs": 1, - "total_flos": 3.382883829861581e+16, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoint-2000/training_args.bin b/checkpoint-2000/training_args.bin deleted file mode 100644 index 29a1b90871dc30211978426049e89f31e2b38f56..0000000000000000000000000000000000000000 --- a/checkpoint-2000/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2493c95326c359fb00f88976162bc7966690beaaca22964b91c1db649a04988f -size 6011 diff --git a/checkpoint-2200/README.md b/checkpoint-2200/README.md deleted file mode 100644 index 82793f73e61dbb024e11fc6697bba1622d4d0db6..0000000000000000000000000000000000000000 --- a/checkpoint-2200/README.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -library_name: peft ---- -## Training procedure - - -The following `bitsandbytes` quantization config was used during training: -- load_in_8bit: False -- load_in_4bit: True -- llm_int8_threshold: 6.0 -- llm_int8_skip_modules: None -- llm_int8_enable_fp32_cpu_offload: False -- llm_int8_has_fp16_weight: False -- bnb_4bit_quant_type: nf4 -- bnb_4bit_use_double_quant: True -- bnb_4bit_compute_dtype: bfloat16 -### Framework versions - - -- PEFT 0.4.0 diff --git a/checkpoint-2200/adapter_config.json b/checkpoint-2200/adapter_config.json deleted file mode 100644 index a2f0ea437da66b2120cc72d92fb46f999dfb8535..0000000000000000000000000000000000000000 --- a/checkpoint-2200/adapter_config.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "auto_mapping": null, - "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": true, - "layers_pattern": null, - "layers_to_transform": null, - "lora_alpha": 16.0, - "lora_dropout": 0.1, - "modules_to_save": null, - "peft_type": "LORA", - "r": 64, - "revision": null, - "target_modules": [ - "down_proj", - "up_proj", - "q_proj", - "gate_proj", - "o_proj", - "v_proj", - "k_proj" - ], - "task_type": "CAUSAL_LM" -} \ No newline at end of file diff --git a/checkpoint-2200/adapter_model.bin b/checkpoint-2200/adapter_model.bin deleted file mode 100644 index 17a5d9a4024f623f507a7c923ee385b59403ab9b..0000000000000000000000000000000000000000 --- a/checkpoint-2200/adapter_model.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:18f11fbc4708b106870eec7154c2b9bbcad7ba5b185b2bacd1b7a7c4926deed7 -size 871609293 diff --git a/checkpoint-2200/adapter_model/adapter_model/README.md b/checkpoint-2200/adapter_model/adapter_model/README.md deleted file mode 100644 index 5f53b1d1fb6c73b71b73ea36af61fcd504b1117e..0000000000000000000000000000000000000000 --- a/checkpoint-2200/adapter_model/adapter_model/README.md +++ /dev/null @@ -1,80 +0,0 @@ ---- -library_name: peft ---- -## Training procedure - - -The following `bitsandbytes` quantization config was used during training: -- load_in_8bit: False -- load_in_4bit: True -- llm_int8_threshold: 6.0 -- llm_int8_skip_modules: None -- llm_int8_enable_fp32_cpu_offload: False -- llm_int8_has_fp16_weight: False -- bnb_4bit_quant_type: nf4 -- bnb_4bit_use_double_quant: True -- bnb_4bit_compute_dtype: bfloat16 - -The following `bitsandbytes` quantization config was used during training: -- load_in_8bit: False -- load_in_4bit: True -- llm_int8_threshold: 6.0 -- llm_int8_skip_modules: None -- llm_int8_enable_fp32_cpu_offload: False -- llm_int8_has_fp16_weight: False -- bnb_4bit_quant_type: nf4 -- bnb_4bit_use_double_quant: True -- bnb_4bit_compute_dtype: bfloat16 - -The following `bitsandbytes` quantization config was used during training: -- load_in_8bit: False -- load_in_4bit: True -- llm_int8_threshold: 6.0 -- llm_int8_skip_modules: None -- llm_int8_enable_fp32_cpu_offload: False -- llm_int8_has_fp16_weight: False -- bnb_4bit_quant_type: nf4 -- bnb_4bit_use_double_quant: True -- bnb_4bit_compute_dtype: bfloat16 - -The following `bitsandbytes` quantization config was used during training: -- load_in_8bit: False -- load_in_4bit: True -- llm_int8_threshold: 6.0 -- llm_int8_skip_modules: None -- llm_int8_enable_fp32_cpu_offload: False -- llm_int8_has_fp16_weight: False -- bnb_4bit_quant_type: nf4 -- bnb_4bit_use_double_quant: True -- bnb_4bit_compute_dtype: bfloat16 - -The following `bitsandbytes` quantization config was used during training: -- load_in_8bit: False -- load_in_4bit: True -- llm_int8_threshold: 6.0 -- llm_int8_skip_modules: None -- llm_int8_enable_fp32_cpu_offload: False -- llm_int8_has_fp16_weight: False -- bnb_4bit_quant_type: nf4 -- bnb_4bit_use_double_quant: True -- bnb_4bit_compute_dtype: bfloat16 - -The following `bitsandbytes` quantization config was used during training: -- load_in_8bit: False -- load_in_4bit: True -- llm_int8_threshold: 6.0 -- llm_int8_skip_modules: None -- llm_int8_enable_fp32_cpu_offload: False -- llm_int8_has_fp16_weight: False -- bnb_4bit_quant_type: nf4 -- bnb_4bit_use_double_quant: True -- bnb_4bit_compute_dtype: bfloat16 -### Framework versions - -- PEFT 0.4.0 -- PEFT 0.4.0 -- PEFT 0.4.0 -- PEFT 0.4.0 -- PEFT 0.4.0 - -- PEFT 0.4.0 diff --git a/checkpoint-2200/adapter_model/adapter_model/adapter_config.json b/checkpoint-2200/adapter_model/adapter_model/adapter_config.json deleted file mode 100644 index 2adcd7d22e9c842efe5230fdbfc7ae7a84aededb..0000000000000000000000000000000000000000 --- a/checkpoint-2200/adapter_model/adapter_model/adapter_config.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "auto_mapping": null, - "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": true, - "layers_pattern": null, - "layers_to_transform": null, - "lora_alpha": 16.0, - "lora_dropout": 0.1, - "modules_to_save": null, - "peft_type": "LORA", - "r": 64, - "revision": null, - "target_modules": [ - "q_proj", - "o_proj", - "k_proj", - "gate_proj", - "down_proj", - "v_proj", - "up_proj" - ], - "task_type": "CAUSAL_LM" -} \ No newline at end of file diff --git a/checkpoint-2200/adapter_model/adapter_model/adapter_model.bin b/checkpoint-2200/adapter_model/adapter_model/adapter_model.bin deleted file mode 100644 index a07c42932c8213b6199c8b6020b7690682ce65df..0000000000000000000000000000000000000000 --- a/checkpoint-2200/adapter_model/adapter_model/adapter_model.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8300649a3cb3257506bd84a299764cdbbadb65ebf8c06576deb99c0b813044d3 -size 871609293 diff --git a/checkpoint-2200/added_tokens.json b/checkpoint-2200/added_tokens.json deleted file mode 100644 index e41416ddd79948246ea2dced6800ea3cd531c424..0000000000000000000000000000000000000000 --- a/checkpoint-2200/added_tokens.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "[PAD]": 32000 -} diff --git a/checkpoint-2200/optimizer.pt b/checkpoint-2200/optimizer.pt deleted file mode 100644 index 11b6afbf4f438e51719ce846f06f76983967b2a3..0000000000000000000000000000000000000000 --- a/checkpoint-2200/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:782517fcce8ab1d24acc99ecafbd35461a60b67479b118a71921b13139c6347d -size 873873439 diff --git a/checkpoint-2200/rng_state.pth b/checkpoint-2200/rng_state.pth deleted file mode 100644 index 4317191df53cab90a68876e74601eb1c4f340eff..0000000000000000000000000000000000000000 --- a/checkpoint-2200/rng_state.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f1fc4b7401fd0efa3a43831682cea6e692f653e982f2004f30e952d611992a90 -size 14511 diff --git a/checkpoint-2200/scheduler.pt b/checkpoint-2200/scheduler.pt deleted file mode 100644 index d856a76a2f6f02ab5c9f40c3613e6b6e9c558448..0000000000000000000000000000000000000000 --- a/checkpoint-2200/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:092acf798ddb85605a10300c434ed87828e2ee5daceee07a115b3c1278ee199e -size 627 diff --git a/checkpoint-2200/special_tokens_map.json b/checkpoint-2200/special_tokens_map.json deleted file mode 100644 index 3f58a5e115855c6ea3cec98accae196ad927222e..0000000000000000000000000000000000000000 --- a/checkpoint-2200/special_tokens_map.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "bos_token": "", - "eos_token": "", - "pad_token": "[PAD]", - "unk_token": "" -} diff --git a/checkpoint-2200/tokenizer.model b/checkpoint-2200/tokenizer.model deleted file mode 100644 index 6c00c742ce03c627d6cd5b795984876fa49fa899..0000000000000000000000000000000000000000 --- a/checkpoint-2200/tokenizer.model +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 -size 499723 diff --git a/checkpoint-2200/tokenizer_config.json b/checkpoint-2200/tokenizer_config.json deleted file mode 100644 index daaef2433dab9469de98b5b9a3848221ab25b7e8..0000000000000000000000000000000000000000 --- a/checkpoint-2200/tokenizer_config.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "bos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - }, - "clean_up_tokenization_spaces": false, - "eos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - }, - "legacy": null, - "model_max_length": 1000000000000000019884624838656, - "pad_token": null, - "padding_side": "right", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizer", - "unk_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - } -} diff --git a/checkpoint-2200/trainer_state.json b/checkpoint-2200/trainer_state.json deleted file mode 100644 index fff5ed6bd5c0582662fbe46742e7f7f74b40bf04..0000000000000000000000000000000000000000 --- a/checkpoint-2200/trainer_state.json +++ /dev/null @@ -1,13414 +0,0 @@ -{ - "best_metric": 6.580160140991211, - "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-2200", - "epoch": 0.01680543885111909, - "global_step": 2200, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0808, - "step": 1 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8773, - "step": 2 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1965, - "step": 3 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.118, - "step": 4 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1773, - "step": 5 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1165, - "step": 6 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2666, - "step": 7 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3704, - "step": 8 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9976, - "step": 9 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.985, - "step": 10 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.0541, - "step": 11 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.6228, - "step": 12 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.3651, - "step": 13 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0867, - "step": 14 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.4422, - "step": 15 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.7759, - "step": 16 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1446, - "step": 17 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0007, - "step": 18 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0894, - "step": 19 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2424, - "step": 20 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.1343, - "step": 21 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5354, - "step": 22 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1887, - "step": 23 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.6652, - "step": 24 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.964, - "step": 25 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1872, - "step": 26 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.4722, - "step": 27 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1462, - "step": 28 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0485, - "step": 29 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.148, - "step": 30 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7274, - "step": 31 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.6689, - "step": 32 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3384, - "step": 33 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.5354, - "step": 34 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.1976, - "step": 35 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.8593, - "step": 36 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.9302, - "step": 37 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5968, - "step": 38 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3169, - "step": 39 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.1793, - "step": 40 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.8457, - "step": 41 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5177, - "step": 42 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.003, - "step": 43 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.9928, - "step": 44 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.2574, - "step": 45 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3915, - "step": 46 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4105, - "step": 47 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.1184, - "step": 48 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.72, - "step": 49 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9628, - "step": 50 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2372, - "step": 51 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3733, - "step": 52 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8936, - "step": 53 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5353, - "step": 54 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.0754, - "step": 55 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6685, - "step": 56 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8984, - "step": 57 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2265, - "step": 58 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7696, - "step": 59 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7349, - "step": 60 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0221, - "step": 61 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.1901, - "step": 62 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.387, - "step": 63 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7323, - "step": 64 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2077, - "step": 65 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.3155, - "step": 66 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1656, - "step": 67 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 13.0828, - "step": 68 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5295, - "step": 69 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4575, - "step": 70 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.7654, - "step": 71 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.6263, - "step": 72 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 24.8238, - "step": 73 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.0654, - "step": 74 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 28.1046, - "step": 75 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.3232, - "step": 76 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 22.9712, - "step": 77 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 18.8529, - "step": 78 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.8356, - "step": 79 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 16.472, - "step": 80 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.2369, - "step": 81 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.0731, - "step": 82 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.8853, - "step": 83 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5438, - "step": 84 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2665, - "step": 85 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5484, - "step": 86 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7546, - "step": 87 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4309, - "step": 88 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5593, - "step": 89 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3822, - "step": 90 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.6315, - "step": 91 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6116, - "step": 92 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2288, - "step": 93 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0053, - "step": 94 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.359, - "step": 95 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9235, - "step": 96 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 31.9845, - "step": 97 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.1385, - "step": 98 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6161, - "step": 99 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8096, - "step": 100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9918, - "step": 101 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.344, - "step": 102 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1607, - "step": 103 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4834, - "step": 104 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.704, - "step": 105 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1238, - "step": 106 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8066, - "step": 107 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9656, - "step": 108 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1979, - "step": 109 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2294, - "step": 110 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.066, - "step": 111 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7914, - "step": 112 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7344, - "step": 113 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6703, - "step": 114 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8817, - "step": 115 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.7733, - "step": 116 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.469, - "step": 117 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1304, - "step": 118 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.871, - "step": 119 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5353, - "step": 120 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9055, - "step": 121 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6142, - "step": 122 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0201, - "step": 123 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3805, - "step": 124 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6825, - "step": 125 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7166, - "step": 126 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7747, - "step": 127 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7695, - "step": 128 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7291, - "step": 129 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1296, - "step": 130 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5374, - "step": 131 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1854, - "step": 132 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.434, - "step": 133 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.438, - "step": 134 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3027, - "step": 135 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.382, - "step": 136 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9277, - "step": 137 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.223, - "step": 138 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3042, - "step": 139 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6361, - "step": 140 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3547, - "step": 141 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.7181, - "step": 142 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.7528, - "step": 143 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.4316, - "step": 144 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2219, - "step": 145 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7788, - "step": 146 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2749, - "step": 147 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2397, - "step": 148 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6243, - "step": 149 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.145, - "step": 150 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7951, - "step": 151 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1862, - "step": 152 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1305, - "step": 153 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5766, - "step": 154 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9232, - "step": 155 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9936, - "step": 156 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.9692, - "step": 157 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2772, - "step": 158 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.302, - "step": 159 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9931, - "step": 160 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9675, - "step": 161 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8536, - "step": 162 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6589, - "step": 163 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.932, - "step": 164 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0301, - "step": 165 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4861, - "step": 166 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1354, - "step": 167 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0717, - "step": 168 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9346, - "step": 169 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9373, - "step": 170 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8777, - "step": 171 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4193, - "step": 172 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6831, - "step": 173 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4175, - "step": 174 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3629, - "step": 175 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.118, - "step": 176 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.633, - "step": 177 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8355, - "step": 178 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4522, - "step": 179 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9272, - "step": 180 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4631, - "step": 181 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2987, - "step": 182 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1183, - "step": 183 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9976, - "step": 184 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0668, - "step": 185 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6291, - "step": 186 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5937, - "step": 187 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7382, - "step": 188 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7677, - "step": 189 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0293, - "step": 190 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6407, - "step": 191 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9508, - "step": 192 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.5053, - "step": 193 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5718, - "step": 194 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5211, - "step": 195 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9557, - "step": 196 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1609, - "step": 197 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8505, - "step": 198 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8278, - "step": 199 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8447, - "step": 200 - }, - { - "epoch": 0.0, - "eval_loss": 7.883856773376465, - "eval_runtime": 22.4254, - "eval_samples_per_second": 2.23, - "eval_steps_per_second": 1.115, - "step": 200 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 4.629522514343262, - "step": 200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3249, - "step": 201 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.352, - "step": 202 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2984, - "step": 203 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.2734, - "step": 204 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1, - "step": 205 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.448, - "step": 206 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2387, - "step": 207 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.861, - "step": 208 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.603, - "step": 209 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.29, - "step": 210 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2105, - "step": 211 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1949, - "step": 212 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0538, - "step": 213 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0343, - "step": 214 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7794, - "step": 215 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5532, - "step": 216 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2676, - "step": 217 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.566, - "step": 218 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0432, - "step": 219 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9391, - "step": 220 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.724, - "step": 221 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.229, - "step": 222 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3462, - "step": 223 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0752, - "step": 224 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1966, - "step": 225 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7279, - "step": 226 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8484, - "step": 227 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7291, - "step": 228 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2665, - "step": 229 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3551, - "step": 230 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7338, - "step": 231 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8407, - "step": 232 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3581, - "step": 233 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.441, - "step": 234 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0788, - "step": 235 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8404, - "step": 236 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4314, - "step": 237 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8426, - "step": 238 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.0205, - "step": 239 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4162, - "step": 240 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7515, - "step": 241 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1442, - "step": 242 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5868, - "step": 243 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6514, - "step": 244 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2683, - "step": 245 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.31, - "step": 246 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0161, - "step": 247 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.484, - "step": 248 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9726, - "step": 249 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0926, - "step": 250 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5279, - "step": 251 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0017, - "step": 252 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5684, - "step": 253 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3875, - "step": 254 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9489, - "step": 255 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.8948, - "step": 256 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0856, - "step": 257 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.599, - "step": 258 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1575, - "step": 259 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3701, - "step": 260 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.464, - "step": 261 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9193, - "step": 262 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5679, - "step": 263 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9424, - "step": 264 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6689, - "step": 265 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6475, - "step": 266 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4311, - "step": 267 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7426, - "step": 268 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5191, - "step": 269 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3059, - "step": 270 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0142, - "step": 271 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.4509, - "step": 272 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0831, - "step": 273 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6977, - "step": 274 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4236, - "step": 275 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2129, - "step": 276 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1394, - "step": 277 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.685, - "step": 278 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0275, - "step": 279 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.3215, - "step": 280 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6542, - "step": 281 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7614, - "step": 282 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2996, - "step": 283 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6275, - "step": 284 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8736, - "step": 285 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4667, - "step": 286 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8486, - "step": 287 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2125, - "step": 288 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4523, - "step": 289 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.551, - "step": 290 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.7158, - "step": 291 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5092, - "step": 292 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9169, - "step": 293 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5333, - "step": 294 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9949, - "step": 295 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.7189, - "step": 296 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2366, - "step": 297 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4745, - "step": 298 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2439, - "step": 299 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4176, - "step": 300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.9365, - "step": 301 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5309, - "step": 302 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2201, - "step": 303 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0312, - "step": 304 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4173, - "step": 305 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4856, - "step": 306 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5041, - "step": 307 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3597, - "step": 308 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8395, - "step": 309 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0776, - "step": 310 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7566, - "step": 311 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9767, - "step": 312 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3804, - "step": 313 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5327, - "step": 314 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5293, - "step": 315 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4531, - "step": 316 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3961, - "step": 317 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5669, - "step": 318 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8559, - "step": 319 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.117, - "step": 320 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4279, - "step": 321 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7977, - "step": 322 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.955, - "step": 323 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0164, - "step": 324 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.0495, - "step": 325 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2768, - "step": 326 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3162, - "step": 327 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.88, - "step": 328 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2157, - "step": 329 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8427, - "step": 330 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9729, - "step": 331 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1779, - "step": 332 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1302, - "step": 333 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7705, - "step": 334 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.523, - "step": 335 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9375, - "step": 336 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.1409, - "step": 337 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.633, - "step": 338 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6481, - "step": 339 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.933, - "step": 340 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9179, - "step": 341 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9332, - "step": 342 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6553, - "step": 343 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7412, - "step": 344 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.849, - "step": 345 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.7321, - "step": 346 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9717, - "step": 347 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3465, - "step": 348 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4535, - "step": 349 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2376, - "step": 350 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9025, - "step": 351 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.916, - "step": 352 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.3785, - "step": 353 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0576, - "step": 354 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5081, - "step": 355 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1303, - "step": 356 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3854, - "step": 357 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5553, - "step": 358 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9627, - "step": 359 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.402, - "step": 360 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3484, - "step": 361 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5428, - "step": 362 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9128, - "step": 363 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3934, - "step": 364 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4812, - "step": 365 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5395, - "step": 366 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6304, - "step": 367 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5626, - "step": 368 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5693, - "step": 369 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3458, - "step": 370 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6254, - "step": 371 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8706, - "step": 372 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6076, - "step": 373 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.2912, - "step": 374 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3326, - "step": 375 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3735, - "step": 376 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4916, - "step": 377 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5553, - "step": 378 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6241, - "step": 379 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6106, - "step": 380 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.266, - "step": 381 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7738, - "step": 382 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4988, - "step": 383 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2968, - "step": 384 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8512, - "step": 385 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0341, - "step": 386 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.898, - "step": 387 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.23, - "step": 388 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9608, - "step": 389 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.3679, - "step": 390 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.7074, - "step": 391 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9903, - "step": 392 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5845, - "step": 393 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6493, - "step": 394 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7962, - "step": 395 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4865, - "step": 396 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3418, - "step": 397 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3942, - "step": 398 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4715, - "step": 399 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2073, - "step": 400 - }, - { - "epoch": 0.0, - "eval_loss": 7.106412410736084, - "eval_runtime": 22.5667, - "eval_samples_per_second": 2.216, - "eval_steps_per_second": 1.108, - "step": 400 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 2.9128687667846678, - "step": 400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3984, - "step": 401 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7983, - "step": 402 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8589, - "step": 403 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9884, - "step": 404 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4427, - "step": 405 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0374, - "step": 406 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7999, - "step": 407 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2437, - "step": 408 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6902, - "step": 409 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.81, - "step": 410 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8979, - "step": 411 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0211, - "step": 412 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3945, - "step": 413 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5807, - "step": 414 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1433, - "step": 415 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9466, - "step": 416 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6276, - "step": 417 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4945, - "step": 418 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6215, - "step": 419 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.3919, - "step": 420 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7915, - "step": 421 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3284, - "step": 422 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8723, - "step": 423 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0149, - "step": 424 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.979, - "step": 425 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9175, - "step": 426 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4994, - "step": 427 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9791, - "step": 428 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1156, - "step": 429 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5813, - "step": 430 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1882, - "step": 431 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9956, - "step": 432 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6189, - "step": 433 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9624, - "step": 434 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5387, - "step": 435 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4605, - "step": 436 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.474, - "step": 437 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0497, - "step": 438 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5705, - "step": 439 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.275, - "step": 440 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9638, - "step": 441 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4857, - "step": 442 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3067, - "step": 443 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8152, - "step": 444 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1668, - "step": 445 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5293, - "step": 446 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3981, - "step": 447 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4787, - "step": 448 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5981, - "step": 449 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3569, - "step": 450 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4088, - "step": 451 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3677, - "step": 452 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4686, - "step": 453 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3552, - "step": 454 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7931, - "step": 455 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9285, - "step": 456 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0554, - "step": 457 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7277, - "step": 458 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2474, - "step": 459 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9274, - "step": 460 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2558, - "step": 461 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7547, - "step": 462 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1264, - "step": 463 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2124, - "step": 464 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8751, - "step": 465 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7317, - "step": 466 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3697, - "step": 467 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0021, - "step": 468 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3761, - "step": 469 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2291, - "step": 470 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7968, - "step": 471 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9454, - "step": 472 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0194, - "step": 473 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5048, - "step": 474 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6837, - "step": 475 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1066, - "step": 476 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3501, - "step": 477 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5071, - "step": 478 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1086, - "step": 479 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7269, - "step": 480 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5419, - "step": 481 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2974, - "step": 482 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.1433, - "step": 483 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0869, - "step": 484 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.032, - "step": 485 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0946, - "step": 486 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7162, - "step": 487 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0406, - "step": 488 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9048, - "step": 489 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2231, - "step": 490 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.6524, - "step": 491 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1151, - "step": 492 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.591, - "step": 493 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1628, - "step": 494 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0757, - "step": 495 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3471, - "step": 496 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9385, - "step": 497 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9362, - "step": 498 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2252, - "step": 499 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.359, - "step": 500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0497, - "step": 501 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0484, - "step": 502 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5773, - "step": 503 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.39, - "step": 504 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5923, - "step": 505 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2, - "step": 506 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5536, - "step": 507 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.8958, - "step": 508 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7763, - "step": 509 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2045, - "step": 510 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4219, - "step": 511 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6305, - "step": 512 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4243, - "step": 513 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7842, - "step": 514 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8769, - "step": 515 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8903, - "step": 516 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0489, - "step": 517 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1314, - "step": 518 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5973, - "step": 519 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8022, - "step": 520 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3539, - "step": 521 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.222, - "step": 522 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5403, - "step": 523 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1323, - "step": 524 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7813, - "step": 525 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4982, - "step": 526 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2426, - "step": 527 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0142, - "step": 528 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8996, - "step": 529 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8671, - "step": 530 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4139, - "step": 531 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9478, - "step": 532 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7062, - "step": 533 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0098, - "step": 534 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9195, - "step": 535 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0255, - "step": 536 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6291, - "step": 537 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3245, - "step": 538 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6382, - "step": 539 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.8076, - "step": 540 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6725, - "step": 541 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0563, - "step": 542 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.6178, - "step": 543 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7974, - "step": 544 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7535, - "step": 545 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4948, - "step": 546 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8941, - "step": 547 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6496, - "step": 548 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9084, - "step": 549 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.65, - "step": 550 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7673, - "step": 551 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2221, - "step": 552 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.14, - "step": 553 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.6747, - "step": 554 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8009, - "step": 555 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7307, - "step": 556 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0143, - "step": 557 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8098, - "step": 558 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.026, - "step": 559 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4572, - "step": 560 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7913, - "step": 561 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9962, - "step": 562 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.767, - "step": 563 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9497, - "step": 564 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9626, - "step": 565 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2536, - "step": 566 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0421, - "step": 567 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8177, - "step": 568 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9241, - "step": 569 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0162, - "step": 570 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3368, - "step": 571 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7515, - "step": 572 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6389, - "step": 573 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.662, - "step": 574 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8097, - "step": 575 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9346, - "step": 576 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3154, - "step": 577 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7724, - "step": 578 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3685, - "step": 579 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.2775, - "step": 580 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.106, - "step": 581 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4733, - "step": 582 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2334, - "step": 583 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9478, - "step": 584 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0013, - "step": 585 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7242, - "step": 586 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.922, - "step": 587 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1418, - "step": 588 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4472, - "step": 589 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4785, - "step": 590 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.783, - "step": 591 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0706, - "step": 592 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4136, - "step": 593 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5969, - "step": 594 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5157, - "step": 595 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5658, - "step": 596 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4647, - "step": 597 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2028, - "step": 598 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6913, - "step": 599 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7239, - "step": 600 - }, - { - "epoch": 0.0, - "eval_loss": 7.012163162231445, - "eval_runtime": 22.5807, - "eval_samples_per_second": 2.214, - "eval_steps_per_second": 1.107, - "step": 600 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.24488224029541, - "step": 600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5253, - "step": 601 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0392, - "step": 602 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.447, - "step": 603 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9441, - "step": 604 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1874, - "step": 605 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7817, - "step": 606 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0348, - "step": 607 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5593, - "step": 608 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9361, - "step": 609 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3534, - "step": 610 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.476, - "step": 611 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0937, - "step": 612 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3027, - "step": 613 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5586, - "step": 614 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3796, - "step": 615 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.676, - "step": 616 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5321, - "step": 617 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0059, - "step": 618 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6139, - "step": 619 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.2391, - "step": 620 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0636, - "step": 621 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0895, - "step": 622 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.62, - "step": 623 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0469, - "step": 624 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2173, - "step": 625 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9432, - "step": 626 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3928, - "step": 627 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0959, - "step": 628 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.1197, - "step": 629 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4277, - "step": 630 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.418, - "step": 631 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8687, - "step": 632 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0156, - "step": 633 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.573, - "step": 634 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.112, - "step": 635 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8954, - "step": 636 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.36, - "step": 637 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.924, - "step": 638 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4625, - "step": 639 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2023, - "step": 640 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0685, - "step": 641 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5304, - "step": 642 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4456, - "step": 643 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7271, - "step": 644 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6011, - "step": 645 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.895, - "step": 646 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.864, - "step": 647 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3452, - "step": 648 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8978, - "step": 649 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2253, - "step": 650 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2813, - "step": 651 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7248, - "step": 652 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4283, - "step": 653 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4304, - "step": 654 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3893, - "step": 655 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1115, - "step": 656 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5892, - "step": 657 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6572, - "step": 658 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.925, - "step": 659 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4431, - "step": 660 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7711, - "step": 661 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9439, - "step": 662 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3781, - "step": 663 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5573, - "step": 664 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.4476, - "step": 665 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0057, - "step": 666 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2702, - "step": 667 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5717, - "step": 668 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2242, - "step": 669 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1, - "step": 670 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0517, - "step": 671 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6543, - "step": 672 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1138, - "step": 673 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.461, - "step": 674 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7094, - "step": 675 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.521, - "step": 676 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7116, - "step": 677 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6343, - "step": 678 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3762, - "step": 679 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3603, - "step": 680 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7144, - "step": 681 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4545, - "step": 682 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8188, - "step": 683 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7965, - "step": 684 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4675, - "step": 685 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0436, - "step": 686 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1219, - "step": 687 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4517, - "step": 688 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8476, - "step": 689 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9284, - "step": 690 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7405, - "step": 691 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7142, - "step": 692 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3979, - "step": 693 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.3285, - "step": 694 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3418, - "step": 695 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4472, - "step": 696 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7355, - "step": 697 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7982, - "step": 698 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4516, - "step": 699 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2532, - "step": 700 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9959, - "step": 701 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0418, - "step": 702 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.7767, - "step": 703 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.774, - "step": 704 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8912, - "step": 705 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2178, - "step": 706 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6197, - "step": 707 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4755, - "step": 708 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8276, - "step": 709 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2925, - "step": 710 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3887, - "step": 711 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1465, - "step": 712 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5806, - "step": 713 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3063, - "step": 714 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6066, - "step": 715 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1536, - "step": 716 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5582, - "step": 717 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0353, - "step": 718 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6415, - "step": 719 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8291, - "step": 720 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.7575, - "step": 721 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9141, - "step": 722 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5217, - "step": 723 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4549, - "step": 724 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8112, - "step": 725 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2729, - "step": 726 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8515, - "step": 727 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9712, - "step": 728 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.097, - "step": 729 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0208, - "step": 730 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1377, - "step": 731 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4019, - "step": 732 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9869, - "step": 733 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2954, - "step": 734 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4144, - "step": 735 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8053, - "step": 736 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8891, - "step": 737 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.812, - "step": 738 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2657, - "step": 739 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3747, - "step": 740 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0364, - "step": 741 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8845, - "step": 742 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.887, - "step": 743 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0706, - "step": 744 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6619, - "step": 745 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2941, - "step": 746 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9192, - "step": 747 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9947, - "step": 748 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6376, - "step": 749 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0358, - "step": 750 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4578, - "step": 751 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7784, - "step": 752 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.632, - "step": 753 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8649, - "step": 754 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7951, - "step": 755 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3841, - "step": 756 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4558, - "step": 757 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7638, - "step": 758 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9413, - "step": 759 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0916, - "step": 760 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1351, - "step": 761 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6078, - "step": 762 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7982, - "step": 763 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6132, - "step": 764 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.551, - "step": 765 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3301, - "step": 766 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4888, - "step": 767 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1476, - "step": 768 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4244, - "step": 769 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6025, - "step": 770 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.102, - "step": 771 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.017, - "step": 772 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4101, - "step": 773 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1741, - "step": 774 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1256, - "step": 775 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5164, - "step": 776 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6959, - "step": 777 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7666, - "step": 778 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4336, - "step": 779 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 11.8478, - "step": 780 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8382, - "step": 781 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1792, - "step": 782 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4424, - "step": 783 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.345, - "step": 784 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6887, - "step": 785 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9867, - "step": 786 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6152, - "step": 787 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7283, - "step": 788 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0157, - "step": 789 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6044, - "step": 790 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4132, - "step": 791 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.735, - "step": 792 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3631, - "step": 793 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2308, - "step": 794 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2184, - "step": 795 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4661, - "step": 796 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9637, - "step": 797 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4178, - "step": 798 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5909, - "step": 799 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1482, - "step": 800 - }, - { - "epoch": 0.01, - "eval_loss": 7.355834484100342, - "eval_runtime": 22.6252, - "eval_samples_per_second": 2.21, - "eval_steps_per_second": 1.105, - "step": 800 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 5.191131496429444, - "step": 800 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.0427, - "step": 801 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2669, - "step": 802 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8026, - "step": 803 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4949, - "step": 804 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4491, - "step": 805 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0383, - "step": 806 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1213, - "step": 807 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5158, - "step": 808 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5648, - "step": 809 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9932, - "step": 810 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6441, - "step": 811 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8661, - "step": 812 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3609, - "step": 813 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6828, - "step": 814 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9693, - "step": 815 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3733, - "step": 816 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6286, - "step": 817 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4349, - "step": 818 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6706, - "step": 819 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3089, - "step": 820 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2394, - "step": 821 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.963, - "step": 822 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6564, - "step": 823 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.997, - "step": 824 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9261, - "step": 825 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1421, - "step": 826 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2335, - "step": 827 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3432, - "step": 828 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0154, - "step": 829 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5135, - "step": 830 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6226, - "step": 831 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1672, - "step": 832 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0853, - "step": 833 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1213, - "step": 834 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7815, - "step": 835 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8916, - "step": 836 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6464, - "step": 837 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3307, - "step": 838 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8165, - "step": 839 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.886, - "step": 840 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4781, - "step": 841 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8351, - "step": 842 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.358, - "step": 843 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6501, - "step": 844 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0864, - "step": 845 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2922, - "step": 846 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.9847, - "step": 847 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2558, - "step": 848 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0195, - "step": 849 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.996, - "step": 850 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5705, - "step": 851 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4136, - "step": 852 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6302, - "step": 853 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8761, - "step": 854 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4995, - "step": 855 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4762, - "step": 856 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5749, - "step": 857 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0273, - "step": 858 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8258, - "step": 859 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1836, - "step": 860 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5493, - "step": 861 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1891, - "step": 862 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7392, - "step": 863 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1655, - "step": 864 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5218, - "step": 865 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3759, - "step": 866 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2497, - "step": 867 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5901, - "step": 868 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0624, - "step": 869 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.2452, - "step": 870 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5649, - "step": 871 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0826, - "step": 872 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2703, - "step": 873 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9088, - "step": 874 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3875, - "step": 875 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2511, - "step": 876 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4065, - "step": 877 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.175, - "step": 878 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8358, - "step": 879 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3208, - "step": 880 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2049, - "step": 881 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8251, - "step": 882 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4262, - "step": 883 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2227, - "step": 884 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1062, - "step": 885 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9417, - "step": 886 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3315, - "step": 887 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0012, - "step": 888 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6386, - "step": 889 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0377, - "step": 890 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6707, - "step": 891 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4955, - "step": 892 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7343, - "step": 893 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8305, - "step": 894 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7016, - "step": 895 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7149, - "step": 896 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5649, - "step": 897 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.815, - "step": 898 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6135, - "step": 899 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8776, - "step": 900 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7288, - "step": 901 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8019, - "step": 902 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0921, - "step": 903 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.824, - "step": 904 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7151, - "step": 905 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5054, - "step": 906 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8095, - "step": 907 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3218, - "step": 908 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9993, - "step": 909 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4433, - "step": 910 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5863, - "step": 911 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.505, - "step": 912 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9734, - "step": 913 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1792, - "step": 914 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4574, - "step": 915 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2787, - "step": 916 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8201, - "step": 917 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2537, - "step": 918 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1387, - "step": 919 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7161, - "step": 920 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2207, - "step": 921 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7953, - "step": 922 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9949, - "step": 923 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9173, - "step": 924 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7903, - "step": 925 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4784, - "step": 926 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2264, - "step": 927 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.566, - "step": 928 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0686, - "step": 929 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.791, - "step": 930 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8393, - "step": 931 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4387, - "step": 932 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2374, - "step": 933 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9598, - "step": 934 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1597, - "step": 935 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0403, - "step": 936 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3301, - "step": 937 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.072, - "step": 938 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4788, - "step": 939 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0656, - "step": 940 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9647, - "step": 941 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1168, - "step": 942 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0293, - "step": 943 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3622, - "step": 944 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8957, - "step": 945 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4, - "step": 946 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6626, - "step": 947 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8212, - "step": 948 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8638, - "step": 949 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6406, - "step": 950 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7069, - "step": 951 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1384, - "step": 952 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.612, - "step": 953 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7201, - "step": 954 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3532, - "step": 955 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1266, - "step": 956 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6192, - "step": 957 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.826, - "step": 958 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9338, - "step": 959 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4487, - "step": 960 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.872, - "step": 961 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8601, - "step": 962 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7401, - "step": 963 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5412, - "step": 964 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2501, - "step": 965 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6837, - "step": 966 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6494, - "step": 967 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.604, - "step": 968 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.837, - "step": 969 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3957, - "step": 970 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3281, - "step": 971 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8264, - "step": 972 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6554, - "step": 973 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5768, - "step": 974 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4187, - "step": 975 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8479, - "step": 976 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9849, - "step": 977 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6471, - "step": 978 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8041, - "step": 979 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8876, - "step": 980 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6423, - "step": 981 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5329, - "step": 982 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2801, - "step": 983 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1699, - "step": 984 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6469, - "step": 985 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6766, - "step": 986 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7538, - "step": 987 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9606, - "step": 988 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0713, - "step": 989 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4965, - "step": 990 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3408, - "step": 991 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4007, - "step": 992 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8921, - "step": 993 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8681, - "step": 994 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.8867, - "step": 995 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.467, - "step": 996 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7895, - "step": 997 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0523, - "step": 998 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4032, - "step": 999 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7719, - "step": 1000 - }, - { - "epoch": 0.01, - "eval_loss": 6.766034126281738, - "eval_runtime": 22.4042, - "eval_samples_per_second": 2.232, - "eval_steps_per_second": 1.116, - "step": 1000 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 4.338861379623413, - "step": 1000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0285, - "step": 1001 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4571, - "step": 1002 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7721, - "step": 1003 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5108, - "step": 1004 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3813, - "step": 1005 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7963, - "step": 1006 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1101, - "step": 1007 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.021, - "step": 1008 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5916, - "step": 1009 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8813, - "step": 1010 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1193, - "step": 1011 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5092, - "step": 1012 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8569, - "step": 1013 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.119, - "step": 1014 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3247, - "step": 1015 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2358, - "step": 1016 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2795, - "step": 1017 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3466, - "step": 1018 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5443, - "step": 1019 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7296, - "step": 1020 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0412, - "step": 1021 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4829, - "step": 1022 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7901, - "step": 1023 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8077, - "step": 1024 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4887, - "step": 1025 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3095, - "step": 1026 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3235, - "step": 1027 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6315, - "step": 1028 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4294, - "step": 1029 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8457, - "step": 1030 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7583, - "step": 1031 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3129, - "step": 1032 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1832, - "step": 1033 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1764, - "step": 1034 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0101, - "step": 1035 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6524, - "step": 1036 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2825, - "step": 1037 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2262, - "step": 1038 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2533, - "step": 1039 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8794, - "step": 1040 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7901, - "step": 1041 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8351, - "step": 1042 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5888, - "step": 1043 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8932, - "step": 1044 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2999, - "step": 1045 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8396, - "step": 1046 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4209, - "step": 1047 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1524, - "step": 1048 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7784, - "step": 1049 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0179, - "step": 1050 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1153, - "step": 1051 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2149, - "step": 1052 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0117, - "step": 1053 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9693, - "step": 1054 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5656, - "step": 1055 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5, - "step": 1056 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.102, - "step": 1057 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3079, - "step": 1058 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5754, - "step": 1059 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6989, - "step": 1060 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9597, - "step": 1061 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3743, - "step": 1062 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8887, - "step": 1063 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3779, - "step": 1064 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5001, - "step": 1065 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4095, - "step": 1066 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5073, - "step": 1067 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1331, - "step": 1068 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.323, - "step": 1069 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6116, - "step": 1070 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1212, - "step": 1071 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0951, - "step": 1072 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2463, - "step": 1073 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4488, - "step": 1074 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.279, - "step": 1075 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5728, - "step": 1076 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1362, - "step": 1077 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6648, - "step": 1078 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.427, - "step": 1079 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8145, - "step": 1080 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5308, - "step": 1081 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.974, - "step": 1082 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1965, - "step": 1083 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8749, - "step": 1084 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7352, - "step": 1085 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7934, - "step": 1086 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6003, - "step": 1087 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5775, - "step": 1088 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.519, - "step": 1089 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7403, - "step": 1090 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8145, - "step": 1091 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5776, - "step": 1092 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3753, - "step": 1093 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9586, - "step": 1094 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7263, - "step": 1095 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7034, - "step": 1096 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0579, - "step": 1097 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8419, - "step": 1098 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0751, - "step": 1099 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6438, - "step": 1100 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8744, - "step": 1101 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4992, - "step": 1102 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8094, - "step": 1103 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.162, - "step": 1104 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8351, - "step": 1105 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8845, - "step": 1106 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1894, - "step": 1107 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.8333, - "step": 1108 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4226, - "step": 1109 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0712, - "step": 1110 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9981, - "step": 1111 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5885, - "step": 1112 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1915, - "step": 1113 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8003, - "step": 1114 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5566, - "step": 1115 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4085, - "step": 1116 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0793, - "step": 1117 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0909, - "step": 1118 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2273, - "step": 1119 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8273, - "step": 1120 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0231, - "step": 1121 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7229, - "step": 1122 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4479, - "step": 1123 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2178, - "step": 1124 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9038, - "step": 1125 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2653, - "step": 1126 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2974, - "step": 1127 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3003, - "step": 1128 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7853, - "step": 1129 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9143, - "step": 1130 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2573, - "step": 1131 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7091, - "step": 1132 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3372, - "step": 1133 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4165, - "step": 1134 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4422, - "step": 1135 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7693, - "step": 1136 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7802, - "step": 1137 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7263, - "step": 1138 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6749, - "step": 1139 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9459, - "step": 1140 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9697, - "step": 1141 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4506, - "step": 1142 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5099, - "step": 1143 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1475, - "step": 1144 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3769, - "step": 1145 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2035, - "step": 1146 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6017, - "step": 1147 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.463, - "step": 1148 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3844, - "step": 1149 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5306, - "step": 1150 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5502, - "step": 1151 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7394, - "step": 1152 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5626, - "step": 1153 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1618, - "step": 1154 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5174, - "step": 1155 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1038, - "step": 1156 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3789, - "step": 1157 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2985, - "step": 1158 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4763, - "step": 1159 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5071, - "step": 1160 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0827, - "step": 1161 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7349, - "step": 1162 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.798, - "step": 1163 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3176, - "step": 1164 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8114, - "step": 1165 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3379, - "step": 1166 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1157, - "step": 1167 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4675, - "step": 1168 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2721, - "step": 1169 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0603, - "step": 1170 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6358, - "step": 1171 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0865, - "step": 1172 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.709, - "step": 1173 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7705, - "step": 1174 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7677, - "step": 1175 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2418, - "step": 1176 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7114, - "step": 1177 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1165, - "step": 1178 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9654, - "step": 1179 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0672, - "step": 1180 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1738, - "step": 1181 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7604, - "step": 1182 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8426, - "step": 1183 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0231, - "step": 1184 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2938, - "step": 1185 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.783, - "step": 1186 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3328, - "step": 1187 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.321, - "step": 1188 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6368, - "step": 1189 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.101, - "step": 1190 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6777, - "step": 1191 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0831, - "step": 1192 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5853, - "step": 1193 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7923, - "step": 1194 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3734, - "step": 1195 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4268, - "step": 1196 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6796, - "step": 1197 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9028, - "step": 1198 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3716, - "step": 1199 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6761, - "step": 1200 - }, - { - "epoch": 0.01, - "eval_loss": 6.9188361167907715, - "eval_runtime": 22.426, - "eval_samples_per_second": 2.23, - "eval_steps_per_second": 1.115, - "step": 1200 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 3.3686839294433595, - "step": 1200 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8855, - "step": 1201 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8206, - "step": 1202 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4401, - "step": 1203 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2366, - "step": 1204 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9885, - "step": 1205 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5444, - "step": 1206 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4259, - "step": 1207 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5369, - "step": 1208 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0839, - "step": 1209 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7622, - "step": 1210 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8979, - "step": 1211 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5508, - "step": 1212 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6439, - "step": 1213 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6249, - "step": 1214 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.495, - "step": 1215 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0642, - "step": 1216 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8997, - "step": 1217 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6275, - "step": 1218 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3317, - "step": 1219 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4635, - "step": 1220 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5197, - "step": 1221 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5928, - "step": 1222 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2363, - "step": 1223 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0266, - "step": 1224 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3356, - "step": 1225 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7927, - "step": 1226 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6952, - "step": 1227 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8878, - "step": 1228 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7472, - "step": 1229 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6454, - "step": 1230 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4972, - "step": 1231 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3347, - "step": 1232 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1631, - "step": 1233 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4708, - "step": 1234 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5697, - "step": 1235 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8218, - "step": 1236 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.269, - "step": 1237 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4165, - "step": 1238 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3653, - "step": 1239 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0152, - "step": 1240 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9157, - "step": 1241 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4086, - "step": 1242 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2493, - "step": 1243 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8279, - "step": 1244 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6649, - "step": 1245 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4405, - "step": 1246 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1992, - "step": 1247 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2055, - "step": 1248 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4395, - "step": 1249 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2475, - "step": 1250 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8205, - "step": 1251 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1053, - "step": 1252 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7494, - "step": 1253 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7387, - "step": 1254 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8983, - "step": 1255 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5614, - "step": 1256 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7617, - "step": 1257 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2445, - "step": 1258 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3043, - "step": 1259 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4214, - "step": 1260 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1384, - "step": 1261 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3914, - "step": 1262 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3287, - "step": 1263 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2174, - "step": 1264 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4397, - "step": 1265 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6875, - "step": 1266 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4512, - "step": 1267 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2834, - "step": 1268 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7651, - "step": 1269 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9263, - "step": 1270 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6721, - "step": 1271 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9178, - "step": 1272 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7967, - "step": 1273 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5242, - "step": 1274 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7794, - "step": 1275 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4256, - "step": 1276 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5788, - "step": 1277 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7586, - "step": 1278 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.964, - "step": 1279 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0749, - "step": 1280 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6248, - "step": 1281 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2465, - "step": 1282 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1591, - "step": 1283 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4328, - "step": 1284 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.534, - "step": 1285 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.523, - "step": 1286 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5672, - "step": 1287 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9162, - "step": 1288 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1089, - "step": 1289 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3287, - "step": 1290 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2499, - "step": 1291 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9645, - "step": 1292 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3903, - "step": 1293 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5322, - "step": 1294 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2211, - "step": 1295 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2788, - "step": 1296 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1862, - "step": 1297 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2678, - "step": 1298 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5399, - "step": 1299 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7935, - "step": 1300 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0391, - "step": 1301 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1049, - "step": 1302 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.365, - "step": 1303 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8809, - "step": 1304 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2335, - "step": 1305 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.5135, - "step": 1306 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2378, - "step": 1307 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9265, - "step": 1308 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.641, - "step": 1309 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9822, - "step": 1310 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3369, - "step": 1311 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3735, - "step": 1312 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2618, - "step": 1313 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6854, - "step": 1314 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3748, - "step": 1315 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9206, - "step": 1316 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1969, - "step": 1317 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1245, - "step": 1318 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9977, - "step": 1319 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5319, - "step": 1320 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4431, - "step": 1321 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7264, - "step": 1322 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.05, - "step": 1323 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3118, - "step": 1324 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4575, - "step": 1325 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.593, - "step": 1326 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0061, - "step": 1327 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2481, - "step": 1328 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8017, - "step": 1329 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8617, - "step": 1330 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7036, - "step": 1331 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0091, - "step": 1332 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9687, - "step": 1333 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3925, - "step": 1334 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1127, - "step": 1335 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8163, - "step": 1336 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0639, - "step": 1337 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8929, - "step": 1338 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5011, - "step": 1339 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.033, - "step": 1340 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0526, - "step": 1341 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4428, - "step": 1342 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3477, - "step": 1343 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.881, - "step": 1344 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5276, - "step": 1345 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4183, - "step": 1346 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4943, - "step": 1347 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9187, - "step": 1348 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1003, - "step": 1349 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1187, - "step": 1350 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8081, - "step": 1351 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4695, - "step": 1352 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5761, - "step": 1353 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9635, - "step": 1354 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2133, - "step": 1355 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2611, - "step": 1356 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6885, - "step": 1357 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1157, - "step": 1358 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4421, - "step": 1359 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2128, - "step": 1360 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6978, - "step": 1361 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9804, - "step": 1362 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3426, - "step": 1363 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2676, - "step": 1364 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.325, - "step": 1365 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1263, - "step": 1366 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7481, - "step": 1367 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6891, - "step": 1368 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8568, - "step": 1369 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9893, - "step": 1370 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0302, - "step": 1371 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3204, - "step": 1372 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9008, - "step": 1373 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2624, - "step": 1374 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6234, - "step": 1375 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2286, - "step": 1376 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3426, - "step": 1377 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1962, - "step": 1378 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3142, - "step": 1379 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.414, - "step": 1380 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0191, - "step": 1381 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4953, - "step": 1382 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6694, - "step": 1383 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8611, - "step": 1384 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.86, - "step": 1385 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6519, - "step": 1386 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.394, - "step": 1387 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2117, - "step": 1388 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9924, - "step": 1389 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.58, - "step": 1390 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4415, - "step": 1391 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7196, - "step": 1392 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7388, - "step": 1393 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4784, - "step": 1394 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.496, - "step": 1395 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8706, - "step": 1396 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1858, - "step": 1397 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9038, - "step": 1398 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4852, - "step": 1399 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2717, - "step": 1400 - }, - { - "epoch": 0.01, - "eval_loss": 6.97923469543457, - "eval_runtime": 22.472, - "eval_samples_per_second": 2.225, - "eval_steps_per_second": 1.112, - "step": 1400 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.657382688522339, - "step": 1400 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.843, - "step": 1401 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5611, - "step": 1402 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2434, - "step": 1403 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3136, - "step": 1404 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.686, - "step": 1405 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6365, - "step": 1406 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1811, - "step": 1407 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7537, - "step": 1408 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2949, - "step": 1409 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4827, - "step": 1410 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0965, - "step": 1411 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.007, - "step": 1412 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2861, - "step": 1413 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1774, - "step": 1414 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7777, - "step": 1415 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0259, - "step": 1416 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9024, - "step": 1417 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4786, - "step": 1418 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5873, - "step": 1419 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2744, - "step": 1420 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9484, - "step": 1421 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2093, - "step": 1422 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3394, - "step": 1423 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1247, - "step": 1424 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0691, - "step": 1425 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.559, - "step": 1426 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1518, - "step": 1427 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4143, - "step": 1428 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0287, - "step": 1429 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8112, - "step": 1430 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2625, - "step": 1431 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3528, - "step": 1432 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2715, - "step": 1433 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7849, - "step": 1434 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2002, - "step": 1435 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0658, - "step": 1436 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0671, - "step": 1437 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2577, - "step": 1438 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.803, - "step": 1439 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2974, - "step": 1440 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0897, - "step": 1441 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0805, - "step": 1442 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7681, - "step": 1443 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6565, - "step": 1444 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0174, - "step": 1445 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8507, - "step": 1446 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2105, - "step": 1447 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.572, - "step": 1448 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2904, - "step": 1449 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4623, - "step": 1450 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4774, - "step": 1451 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1277, - "step": 1452 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6204, - "step": 1453 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3219, - "step": 1454 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2517, - "step": 1455 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3026, - "step": 1456 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4016, - "step": 1457 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5256, - "step": 1458 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9316, - "step": 1459 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.631, - "step": 1460 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2888, - "step": 1461 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5511, - "step": 1462 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.9799, - "step": 1463 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6982, - "step": 1464 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4923, - "step": 1465 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8329, - "step": 1466 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2733, - "step": 1467 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8221, - "step": 1468 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.363, - "step": 1469 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6348, - "step": 1470 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3319, - "step": 1471 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6768, - "step": 1472 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1985, - "step": 1473 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6109, - "step": 1474 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.974, - "step": 1475 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8902, - "step": 1476 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6762, - "step": 1477 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8541, - "step": 1478 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3867, - "step": 1479 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9624, - "step": 1480 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8768, - "step": 1481 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7598, - "step": 1482 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6522, - "step": 1483 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8156, - "step": 1484 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3791, - "step": 1485 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2178, - "step": 1486 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8448, - "step": 1487 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5377, - "step": 1488 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7407, - "step": 1489 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7636, - "step": 1490 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4325, - "step": 1491 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8966, - "step": 1492 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0626, - "step": 1493 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.255, - "step": 1494 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2802, - "step": 1495 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.894, - "step": 1496 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6482, - "step": 1497 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8903, - "step": 1498 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8672, - "step": 1499 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6079, - "step": 1500 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6217, - "step": 1501 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2361, - "step": 1502 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3962, - "step": 1503 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0036, - "step": 1504 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5926, - "step": 1505 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.114, - "step": 1506 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4419, - "step": 1507 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7838, - "step": 1508 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6635, - "step": 1509 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2906, - "step": 1510 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4186, - "step": 1511 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4783, - "step": 1512 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1226, - "step": 1513 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2458, - "step": 1514 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5302, - "step": 1515 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1515, - "step": 1516 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4182, - "step": 1517 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8248, - "step": 1518 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2349, - "step": 1519 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9314, - "step": 1520 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1161, - "step": 1521 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4183, - "step": 1522 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4454, - "step": 1523 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5588, - "step": 1524 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8026, - "step": 1525 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7695, - "step": 1526 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3636, - "step": 1527 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2776, - "step": 1528 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5386, - "step": 1529 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.521, - "step": 1530 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8388, - "step": 1531 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3561, - "step": 1532 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9606, - "step": 1533 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9438, - "step": 1534 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7665, - "step": 1535 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5826, - "step": 1536 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.0798, - "step": 1537 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8545, - "step": 1538 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.302, - "step": 1539 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1092, - "step": 1540 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5021, - "step": 1541 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9384, - "step": 1542 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8761, - "step": 1543 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3316, - "step": 1544 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2051, - "step": 1545 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7907, - "step": 1546 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2534, - "step": 1547 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2274, - "step": 1548 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9226, - "step": 1549 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2502, - "step": 1550 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2703, - "step": 1551 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4359, - "step": 1552 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.128, - "step": 1553 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3147, - "step": 1554 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.026, - "step": 1555 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9393, - "step": 1556 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7753, - "step": 1557 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9049, - "step": 1558 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0538, - "step": 1559 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8691, - "step": 1560 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9377, - "step": 1561 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8385, - "step": 1562 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.939, - "step": 1563 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.727, - "step": 1564 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7866, - "step": 1565 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2439, - "step": 1566 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9607, - "step": 1567 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3505, - "step": 1568 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7729, - "step": 1569 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4669, - "step": 1570 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8178, - "step": 1571 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2173, - "step": 1572 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2136, - "step": 1573 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2888, - "step": 1574 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0386, - "step": 1575 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9041, - "step": 1576 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7544, - "step": 1577 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.3229, - "step": 1578 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4203, - "step": 1579 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.497, - "step": 1580 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8253, - "step": 1581 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0801, - "step": 1582 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1585, - "step": 1583 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6965, - "step": 1584 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.498, - "step": 1585 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8697, - "step": 1586 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2663, - "step": 1587 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7004, - "step": 1588 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6561, - "step": 1589 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.785, - "step": 1590 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5708, - "step": 1591 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.326, - "step": 1592 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2974, - "step": 1593 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1408, - "step": 1594 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6526, - "step": 1595 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4116, - "step": 1596 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0484, - "step": 1597 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3162, - "step": 1598 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3806, - "step": 1599 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0251, - "step": 1600 - }, - { - "epoch": 0.01, - "eval_loss": 6.617897987365723, - "eval_runtime": 22.4646, - "eval_samples_per_second": 2.226, - "eval_steps_per_second": 1.113, - "step": 1600 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.160770101547241, - "step": 1600 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9137, - "step": 1601 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2188, - "step": 1602 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7688, - "step": 1603 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9799, - "step": 1604 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5429, - "step": 1605 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8559, - "step": 1606 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3985, - "step": 1607 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9139, - "step": 1608 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3303, - "step": 1609 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5168, - "step": 1610 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5194, - "step": 1611 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9557, - "step": 1612 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7102, - "step": 1613 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8961, - "step": 1614 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6123, - "step": 1615 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7808, - "step": 1616 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4919, - "step": 1617 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0723, - "step": 1618 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2931, - "step": 1619 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8478, - "step": 1620 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7126, - "step": 1621 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6622, - "step": 1622 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3861, - "step": 1623 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9856, - "step": 1624 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5076, - "step": 1625 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4168, - "step": 1626 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2825, - "step": 1627 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7497, - "step": 1628 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5672, - "step": 1629 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4095, - "step": 1630 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.649, - "step": 1631 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3418, - "step": 1632 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1337, - "step": 1633 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3829, - "step": 1634 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0294, - "step": 1635 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2164, - "step": 1636 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3294, - "step": 1637 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7047, - "step": 1638 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5622, - "step": 1639 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4873, - "step": 1640 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6641, - "step": 1641 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3943, - "step": 1642 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2266, - "step": 1643 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0471, - "step": 1644 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5658, - "step": 1645 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6489, - "step": 1646 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3851, - "step": 1647 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7921, - "step": 1648 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4581, - "step": 1649 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1407, - "step": 1650 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2919, - "step": 1651 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4061, - "step": 1652 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3081, - "step": 1653 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0527, - "step": 1654 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8729, - "step": 1655 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.029, - "step": 1656 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6632, - "step": 1657 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7047, - "step": 1658 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6288, - "step": 1659 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8466, - "step": 1660 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7726, - "step": 1661 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.245, - "step": 1662 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0538, - "step": 1663 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3611, - "step": 1664 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.011, - "step": 1665 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6491, - "step": 1666 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3409, - "step": 1667 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.262, - "step": 1668 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.781, - "step": 1669 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8025, - "step": 1670 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7529, - "step": 1671 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2322, - "step": 1672 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4527, - "step": 1673 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9457, - "step": 1674 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.859, - "step": 1675 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9207, - "step": 1676 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5378, - "step": 1677 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6585, - "step": 1678 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9523, - "step": 1679 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1348, - "step": 1680 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9582, - "step": 1681 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.416, - "step": 1682 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8214, - "step": 1683 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8833, - "step": 1684 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1021, - "step": 1685 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7392, - "step": 1686 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2616, - "step": 1687 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.325, - "step": 1688 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3814, - "step": 1689 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2816, - "step": 1690 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.033, - "step": 1691 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5742, - "step": 1692 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0841, - "step": 1693 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2888, - "step": 1694 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9263, - "step": 1695 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7552, - "step": 1696 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4163, - "step": 1697 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6207, - "step": 1698 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.938, - "step": 1699 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2925, - "step": 1700 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0401, - "step": 1701 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1536, - "step": 1702 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2754, - "step": 1703 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6765, - "step": 1704 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.63, - "step": 1705 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6902, - "step": 1706 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6434, - "step": 1707 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2283, - "step": 1708 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9377, - "step": 1709 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.371, - "step": 1710 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6569, - "step": 1711 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2221, - "step": 1712 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5375, - "step": 1713 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2189, - "step": 1714 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.769, - "step": 1715 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0089, - "step": 1716 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6402, - "step": 1717 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4812, - "step": 1718 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9754, - "step": 1719 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8435, - "step": 1720 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9424, - "step": 1721 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5465, - "step": 1722 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.477, - "step": 1723 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2254, - "step": 1724 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3663, - "step": 1725 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.663, - "step": 1726 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6951, - "step": 1727 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.856, - "step": 1728 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0652, - "step": 1729 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6929, - "step": 1730 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8069, - "step": 1731 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.02, - "step": 1732 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0782, - "step": 1733 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0236, - "step": 1734 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2769, - "step": 1735 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7126, - "step": 1736 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2746, - "step": 1737 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8726, - "step": 1738 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7962, - "step": 1739 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7602, - "step": 1740 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.3105, - "step": 1741 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0771, - "step": 1742 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4738, - "step": 1743 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2952, - "step": 1744 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2692, - "step": 1745 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7201, - "step": 1746 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2978, - "step": 1747 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.518, - "step": 1748 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.659, - "step": 1749 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9101, - "step": 1750 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8397, - "step": 1751 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0451, - "step": 1752 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7696, - "step": 1753 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1377, - "step": 1754 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2621, - "step": 1755 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2143, - "step": 1756 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4378, - "step": 1757 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8631, - "step": 1758 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.019, - "step": 1759 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7475, - "step": 1760 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6228, - "step": 1761 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0703, - "step": 1762 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3426, - "step": 1763 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0842, - "step": 1764 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1032, - "step": 1765 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6321, - "step": 1766 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7804, - "step": 1767 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6566, - "step": 1768 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4985, - "step": 1769 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1129, - "step": 1770 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8081, - "step": 1771 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8441, - "step": 1772 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4054, - "step": 1773 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6334, - "step": 1774 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4323, - "step": 1775 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.18, - "step": 1776 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7573, - "step": 1777 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4642, - "step": 1778 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.038, - "step": 1779 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3785, - "step": 1780 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5088, - "step": 1781 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0139, - "step": 1782 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0999, - "step": 1783 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3224, - "step": 1784 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.634, - "step": 1785 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1264, - "step": 1786 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.317, - "step": 1787 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1279, - "step": 1788 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2364, - "step": 1789 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0627, - "step": 1790 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2471, - "step": 1791 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8407, - "step": 1792 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7083, - "step": 1793 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4522, - "step": 1794 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0308, - "step": 1795 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6915, - "step": 1796 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.082, - "step": 1797 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7956, - "step": 1798 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7007, - "step": 1799 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9197, - "step": 1800 - }, - { - "epoch": 0.01, - "eval_loss": 6.619495868682861, - "eval_runtime": 22.4352, - "eval_samples_per_second": 2.229, - "eval_steps_per_second": 1.114, - "step": 1800 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.238778591156006, - "step": 1800 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1537, - "step": 1801 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.684, - "step": 1802 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7862, - "step": 1803 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3518, - "step": 1804 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1795, - "step": 1805 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0054, - "step": 1806 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8997, - "step": 1807 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9002, - "step": 1808 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2805, - "step": 1809 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1203, - "step": 1810 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0206, - "step": 1811 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0151, - "step": 1812 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3864, - "step": 1813 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1117, - "step": 1814 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8487, - "step": 1815 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.59, - "step": 1816 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1615, - "step": 1817 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7362, - "step": 1818 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2294, - "step": 1819 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5622, - "step": 1820 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5437, - "step": 1821 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.093, - "step": 1822 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0343, - "step": 1823 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4454, - "step": 1824 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5138, - "step": 1825 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5605, - "step": 1826 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.322, - "step": 1827 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6489, - "step": 1828 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.331, - "step": 1829 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6462, - "step": 1830 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.932, - "step": 1831 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9058, - "step": 1832 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3433, - "step": 1833 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4365, - "step": 1834 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3282, - "step": 1835 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.448, - "step": 1836 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5369, - "step": 1837 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.177, - "step": 1838 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3552, - "step": 1839 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4568, - "step": 1840 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0602, - "step": 1841 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7449, - "step": 1842 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2675, - "step": 1843 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0317, - "step": 1844 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4342, - "step": 1845 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8688, - "step": 1846 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3571, - "step": 1847 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3776, - "step": 1848 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2248, - "step": 1849 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6073, - "step": 1850 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8425, - "step": 1851 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5954, - "step": 1852 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4197, - "step": 1853 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8624, - "step": 1854 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9652, - "step": 1855 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7145, - "step": 1856 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5309, - "step": 1857 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4356, - "step": 1858 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6508, - "step": 1859 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0955, - "step": 1860 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6886, - "step": 1861 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7644, - "step": 1862 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5709, - "step": 1863 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6212, - "step": 1864 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6325, - "step": 1865 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6805, - "step": 1866 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1464, - "step": 1867 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9244, - "step": 1868 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.336, - "step": 1869 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8783, - "step": 1870 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8236, - "step": 1871 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.084, - "step": 1872 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9639, - "step": 1873 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4173, - "step": 1874 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0042, - "step": 1875 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2519, - "step": 1876 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4656, - "step": 1877 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5136, - "step": 1878 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3918, - "step": 1879 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9696, - "step": 1880 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9736, - "step": 1881 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6192, - "step": 1882 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3476, - "step": 1883 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3048, - "step": 1884 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1116, - "step": 1885 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.971, - "step": 1886 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0741, - "step": 1887 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1418, - "step": 1888 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3487, - "step": 1889 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.38, - "step": 1890 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6561, - "step": 1891 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5606, - "step": 1892 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8623, - "step": 1893 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2984, - "step": 1894 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6179, - "step": 1895 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8625, - "step": 1896 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8596, - "step": 1897 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7205, - "step": 1898 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6727, - "step": 1899 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.016, - "step": 1900 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9868, - "step": 1901 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.632, - "step": 1902 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5133, - "step": 1903 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7476, - "step": 1904 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4174, - "step": 1905 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6789, - "step": 1906 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4534, - "step": 1907 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3335, - "step": 1908 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7921, - "step": 1909 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9567, - "step": 1910 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.1739, - "step": 1911 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7514, - "step": 1912 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3858, - "step": 1913 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0462, - "step": 1914 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3817, - "step": 1915 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9739, - "step": 1916 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1122, - "step": 1917 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3361, - "step": 1918 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3184, - "step": 1919 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7342, - "step": 1920 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.375, - "step": 1921 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6841, - "step": 1922 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0773, - "step": 1923 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8916, - "step": 1924 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7176, - "step": 1925 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8841, - "step": 1926 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8345, - "step": 1927 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.561, - "step": 1928 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5392, - "step": 1929 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1627, - "step": 1930 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0657, - "step": 1931 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7385, - "step": 1932 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5533, - "step": 1933 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0925, - "step": 1934 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8752, - "step": 1935 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4039, - "step": 1936 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6472, - "step": 1937 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1819, - "step": 1938 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5919, - "step": 1939 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6527, - "step": 1940 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5188, - "step": 1941 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9856, - "step": 1942 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7038, - "step": 1943 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.911, - "step": 1944 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.497, - "step": 1945 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1804, - "step": 1946 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3949, - "step": 1947 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0433, - "step": 1948 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4706, - "step": 1949 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5896, - "step": 1950 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.557, - "step": 1951 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.34, - "step": 1952 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7865, - "step": 1953 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0797, - "step": 1954 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2896, - "step": 1955 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4096, - "step": 1956 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9538, - "step": 1957 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2778, - "step": 1958 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4968, - "step": 1959 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8328, - "step": 1960 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4597, - "step": 1961 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6776, - "step": 1962 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4861, - "step": 1963 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5831, - "step": 1964 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4585, - "step": 1965 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7898, - "step": 1966 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8714, - "step": 1967 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.752, - "step": 1968 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9024, - "step": 1969 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.058, - "step": 1970 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1745, - "step": 1971 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2162, - "step": 1972 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2668, - "step": 1973 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3307, - "step": 1974 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3285, - "step": 1975 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1353, - "step": 1976 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8069, - "step": 1977 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6885, - "step": 1978 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5946, - "step": 1979 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6828, - "step": 1980 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6516, - "step": 1981 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.261, - "step": 1982 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.524, - "step": 1983 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.585, - "step": 1984 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8883, - "step": 1985 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.689, - "step": 1986 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1083, - "step": 1987 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1606, - "step": 1988 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9243, - "step": 1989 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6597, - "step": 1990 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2849, - "step": 1991 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3715, - "step": 1992 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7262, - "step": 1993 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6862, - "step": 1994 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5412, - "step": 1995 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7483, - "step": 1996 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3391, - "step": 1997 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2642, - "step": 1998 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1519, - "step": 1999 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7098, - "step": 2000 - }, - { - "epoch": 0.02, - "eval_loss": 6.762476921081543, - "eval_runtime": 22.4899, - "eval_samples_per_second": 2.223, - "eval_steps_per_second": 1.112, - "step": 2000 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.4606559085845947, - "step": 2000 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8099, - "step": 2001 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0567, - "step": 2002 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2981, - "step": 2003 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2668, - "step": 2004 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.139, - "step": 2005 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.903, - "step": 2006 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2182, - "step": 2007 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2347, - "step": 2008 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8383, - "step": 2009 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0211, - "step": 2010 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2572, - "step": 2011 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2877, - "step": 2012 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3577, - "step": 2013 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2022, - "step": 2014 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2722, - "step": 2015 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0552, - "step": 2016 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.9857, - "step": 2017 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0519, - "step": 2018 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7118, - "step": 2019 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4465, - "step": 2020 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3009, - "step": 2021 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3614, - "step": 2022 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3493, - "step": 2023 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.34, - "step": 2024 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0416, - "step": 2025 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.686, - "step": 2026 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6021, - "step": 2027 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4161, - "step": 2028 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.0029, - "step": 2029 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.8579, - "step": 2030 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0247, - "step": 2031 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4184, - "step": 2032 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4962, - "step": 2033 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5137, - "step": 2034 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.6692, - "step": 2035 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7161, - "step": 2036 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.617, - "step": 2037 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.413, - "step": 2038 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3056, - "step": 2039 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9441, - "step": 2040 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9943, - "step": 2041 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5703, - "step": 2042 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1881, - "step": 2043 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5763, - "step": 2044 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6389, - "step": 2045 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1717, - "step": 2046 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5482, - "step": 2047 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9469, - "step": 2048 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7685, - "step": 2049 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1381, - "step": 2050 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6961, - "step": 2051 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6425, - "step": 2052 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5354, - "step": 2053 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2404, - "step": 2054 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1556, - "step": 2055 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7133, - "step": 2056 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8166, - "step": 2057 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5071, - "step": 2058 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5429, - "step": 2059 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0367, - "step": 2060 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5386, - "step": 2061 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.5899, - "step": 2062 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2968, - "step": 2063 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9951, - "step": 2064 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8608, - "step": 2065 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4735, - "step": 2066 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5612, - "step": 2067 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7461, - "step": 2068 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5887, - "step": 2069 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3426, - "step": 2070 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5589, - "step": 2071 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.498, - "step": 2072 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1306, - "step": 2073 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.3492, - "step": 2074 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2311, - "step": 2075 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8798, - "step": 2076 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6799, - "step": 2077 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5011, - "step": 2078 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8892, - "step": 2079 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6449, - "step": 2080 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9117, - "step": 2081 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1157, - "step": 2082 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.196, - "step": 2083 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.9364, - "step": 2084 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3618, - "step": 2085 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3755, - "step": 2086 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4564, - "step": 2087 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4912, - "step": 2088 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.113, - "step": 2089 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0588, - "step": 2090 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.668, - "step": 2091 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.08, - "step": 2092 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2042, - "step": 2093 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4134, - "step": 2094 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0456, - "step": 2095 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2245, - "step": 2096 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4936, - "step": 2097 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5158, - "step": 2098 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7269, - "step": 2099 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7077, - "step": 2100 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6008, - "step": 2101 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4652, - "step": 2102 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.918, - "step": 2103 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.5819, - "step": 2104 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7764, - "step": 2105 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.0525, - "step": 2106 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5359, - "step": 2107 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4925, - "step": 2108 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4857, - "step": 2109 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.9445, - "step": 2110 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8494, - "step": 2111 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1513, - "step": 2112 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2552, - "step": 2113 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7229, - "step": 2114 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8571, - "step": 2115 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5968, - "step": 2116 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8806, - "step": 2117 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4641, - "step": 2118 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6039, - "step": 2119 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1379, - "step": 2120 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6688, - "step": 2121 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.293, - "step": 2122 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5664, - "step": 2123 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0825, - "step": 2124 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9788, - "step": 2125 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.9641, - "step": 2126 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7799, - "step": 2127 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0619, - "step": 2128 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0022, - "step": 2129 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8022, - "step": 2130 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5301, - "step": 2131 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.681, - "step": 2132 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7362, - "step": 2133 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5462, - "step": 2134 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2356, - "step": 2135 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2259, - "step": 2136 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3646, - "step": 2137 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8765, - "step": 2138 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6487, - "step": 2139 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9622, - "step": 2140 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1761, - "step": 2141 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6922, - "step": 2142 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0371, - "step": 2143 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7869, - "step": 2144 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3725, - "step": 2145 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8894, - "step": 2146 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6083, - "step": 2147 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4451, - "step": 2148 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1149, - "step": 2149 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8058, - "step": 2150 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1308, - "step": 2151 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1447, - "step": 2152 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.208, - "step": 2153 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5193, - "step": 2154 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7729, - "step": 2155 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5019, - "step": 2156 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6092, - "step": 2157 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1853, - "step": 2158 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7, - "step": 2159 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1638, - "step": 2160 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.762, - "step": 2161 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7455, - "step": 2162 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9372, - "step": 2163 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4569, - "step": 2164 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6705, - "step": 2165 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1988, - "step": 2166 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2526, - "step": 2167 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9066, - "step": 2168 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1365, - "step": 2169 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3422, - "step": 2170 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2691, - "step": 2171 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9008, - "step": 2172 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2555, - "step": 2173 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0886, - "step": 2174 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0369, - "step": 2175 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5566, - "step": 2176 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2567, - "step": 2177 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0179, - "step": 2178 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5383, - "step": 2179 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4797, - "step": 2180 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0163, - "step": 2181 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2658, - "step": 2182 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1337, - "step": 2183 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3287, - "step": 2184 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7874, - "step": 2185 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7153, - "step": 2186 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7037, - "step": 2187 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4412, - "step": 2188 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3912, - "step": 2189 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.034, - "step": 2190 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4697, - "step": 2191 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6243, - "step": 2192 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1133, - "step": 2193 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.9005, - "step": 2194 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7386, - "step": 2195 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4169, - "step": 2196 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8139, - "step": 2197 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3012, - "step": 2198 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8223, - "step": 2199 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3757, - "step": 2200 - }, - { - "epoch": 0.02, - "eval_loss": 6.580160140991211, - "eval_runtime": 22.4971, - "eval_samples_per_second": 2.223, - "eval_steps_per_second": 1.111, - "step": 2200 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.755114164352417, - "step": 2200 - } - ], - "max_steps": 30000, - "num_train_epochs": 1, - "total_flos": 3.68090809417728e+16, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoint-2200/training_args.bin b/checkpoint-2200/training_args.bin deleted file mode 100644 index 29a1b90871dc30211978426049e89f31e2b38f56..0000000000000000000000000000000000000000 --- a/checkpoint-2200/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2493c95326c359fb00f88976162bc7966690beaaca22964b91c1db649a04988f -size 6011 diff --git a/checkpoint-2400/README.md b/checkpoint-2400/README.md deleted file mode 100644 index 82793f73e61dbb024e11fc6697bba1622d4d0db6..0000000000000000000000000000000000000000 --- a/checkpoint-2400/README.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -library_name: peft ---- -## Training procedure - - -The following `bitsandbytes` quantization config was used during training: -- load_in_8bit: False -- load_in_4bit: True -- llm_int8_threshold: 6.0 -- llm_int8_skip_modules: None -- llm_int8_enable_fp32_cpu_offload: False -- llm_int8_has_fp16_weight: False -- bnb_4bit_quant_type: nf4 -- bnb_4bit_use_double_quant: True -- bnb_4bit_compute_dtype: bfloat16 -### Framework versions - - -- PEFT 0.4.0 diff --git a/checkpoint-2400/adapter_config.json b/checkpoint-2400/adapter_config.json deleted file mode 100644 index a2f0ea437da66b2120cc72d92fb46f999dfb8535..0000000000000000000000000000000000000000 --- a/checkpoint-2400/adapter_config.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "auto_mapping": null, - "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": true, - "layers_pattern": null, - "layers_to_transform": null, - "lora_alpha": 16.0, - "lora_dropout": 0.1, - "modules_to_save": null, - "peft_type": "LORA", - "r": 64, - "revision": null, - "target_modules": [ - "down_proj", - "up_proj", - "q_proj", - "gate_proj", - "o_proj", - "v_proj", - "k_proj" - ], - "task_type": "CAUSAL_LM" -} \ No newline at end of file diff --git a/checkpoint-2400/adapter_model.bin b/checkpoint-2400/adapter_model.bin deleted file mode 100644 index aa38a64bfc3a8cb8c55c290beaf3783f62c8da4e..0000000000000000000000000000000000000000 --- a/checkpoint-2400/adapter_model.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:23824721820d37d6fe44fee9306d0e71a5826aebaf3eb2f970cab6872288b55a -size 871609293 diff --git a/checkpoint-2400/added_tokens.json b/checkpoint-2400/added_tokens.json deleted file mode 100644 index e41416ddd79948246ea2dced6800ea3cd531c424..0000000000000000000000000000000000000000 --- a/checkpoint-2400/added_tokens.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "[PAD]": 32000 -} diff --git a/checkpoint-2400/optimizer.pt b/checkpoint-2400/optimizer.pt deleted file mode 100644 index b18659185211a65315480e095ce60e088bb764bf..0000000000000000000000000000000000000000 --- a/checkpoint-2400/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0e124e64b77531d5f98edc3d5e51a56228b9c1bccf94fbc775628676a06fb976 -size 873873439 diff --git a/checkpoint-2400/rng_state.pth b/checkpoint-2400/rng_state.pth deleted file mode 100644 index 2d0f2264ea7662abcadfb8caac1c1afa09fe0b4e..0000000000000000000000000000000000000000 --- a/checkpoint-2400/rng_state.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cb0e855be1f0b0e57ff0dd7ae4a8185049253a39a36749acd9a9b1af0d3ab306 -size 14511 diff --git a/checkpoint-2400/scheduler.pt b/checkpoint-2400/scheduler.pt deleted file mode 100644 index 8346177cbadcccf082c60229d4e2d0d00c246e9d..0000000000000000000000000000000000000000 --- a/checkpoint-2400/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1cf2acca1029437820e5d9cd9f7ccc6efd7468e812d0f38078e4079d268163c1 -size 627 diff --git a/checkpoint-2400/special_tokens_map.json b/checkpoint-2400/special_tokens_map.json deleted file mode 100644 index 3f58a5e115855c6ea3cec98accae196ad927222e..0000000000000000000000000000000000000000 --- a/checkpoint-2400/special_tokens_map.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "bos_token": "", - "eos_token": "", - "pad_token": "[PAD]", - "unk_token": "" -} diff --git a/checkpoint-2400/tokenizer.model b/checkpoint-2400/tokenizer.model deleted file mode 100644 index 6c00c742ce03c627d6cd5b795984876fa49fa899..0000000000000000000000000000000000000000 --- a/checkpoint-2400/tokenizer.model +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 -size 499723 diff --git a/checkpoint-2400/tokenizer_config.json b/checkpoint-2400/tokenizer_config.json deleted file mode 100644 index daaef2433dab9469de98b5b9a3848221ab25b7e8..0000000000000000000000000000000000000000 --- a/checkpoint-2400/tokenizer_config.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "bos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - }, - "clean_up_tokenization_spaces": false, - "eos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - }, - "legacy": null, - "model_max_length": 1000000000000000019884624838656, - "pad_token": null, - "padding_side": "right", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizer", - "unk_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - } -} diff --git a/checkpoint-2400/trainer_state.json b/checkpoint-2400/trainer_state.json deleted file mode 100644 index b3ad1e92c3e098aaf21f9adeeb09ccd7c376158a..0000000000000000000000000000000000000000 --- a/checkpoint-2400/trainer_state.json +++ /dev/null @@ -1,14632 +0,0 @@ -{ - "best_metric": 6.580160140991211, - "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-2200", - "epoch": 0.018333206019402644, - "global_step": 2400, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0808, - "step": 1 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8773, - "step": 2 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1965, - "step": 3 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.118, - "step": 4 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1773, - "step": 5 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1165, - "step": 6 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2666, - "step": 7 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3704, - "step": 8 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9976, - "step": 9 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.985, - "step": 10 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.0541, - "step": 11 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.6228, - "step": 12 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.3651, - "step": 13 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0867, - "step": 14 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.4422, - "step": 15 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.7759, - "step": 16 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1446, - "step": 17 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0007, - "step": 18 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0894, - "step": 19 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2424, - "step": 20 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.1343, - "step": 21 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5354, - "step": 22 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1887, - "step": 23 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.6652, - "step": 24 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.964, - "step": 25 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1872, - "step": 26 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.4722, - "step": 27 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1462, - "step": 28 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0485, - "step": 29 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.148, - "step": 30 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7274, - "step": 31 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.6689, - "step": 32 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3384, - "step": 33 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.5354, - "step": 34 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.1976, - "step": 35 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.8593, - "step": 36 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.9302, - "step": 37 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5968, - "step": 38 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3169, - "step": 39 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.1793, - "step": 40 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.8457, - "step": 41 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5177, - "step": 42 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.003, - "step": 43 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.9928, - "step": 44 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.2574, - "step": 45 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3915, - "step": 46 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4105, - "step": 47 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.1184, - "step": 48 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.72, - "step": 49 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9628, - "step": 50 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2372, - "step": 51 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3733, - "step": 52 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8936, - "step": 53 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5353, - "step": 54 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.0754, - "step": 55 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6685, - "step": 56 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8984, - "step": 57 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2265, - "step": 58 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7696, - "step": 59 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7349, - "step": 60 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0221, - "step": 61 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.1901, - "step": 62 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.387, - "step": 63 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7323, - "step": 64 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2077, - "step": 65 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.3155, - "step": 66 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1656, - "step": 67 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 13.0828, - "step": 68 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5295, - "step": 69 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4575, - "step": 70 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.7654, - "step": 71 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.6263, - "step": 72 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 24.8238, - "step": 73 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.0654, - "step": 74 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 28.1046, - "step": 75 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.3232, - "step": 76 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 22.9712, - "step": 77 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 18.8529, - "step": 78 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.8356, - "step": 79 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 16.472, - "step": 80 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.2369, - "step": 81 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.0731, - "step": 82 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.8853, - "step": 83 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5438, - "step": 84 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2665, - "step": 85 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5484, - "step": 86 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7546, - "step": 87 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4309, - "step": 88 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5593, - "step": 89 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3822, - "step": 90 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.6315, - "step": 91 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6116, - "step": 92 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2288, - "step": 93 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0053, - "step": 94 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.359, - "step": 95 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9235, - "step": 96 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 31.9845, - "step": 97 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.1385, - "step": 98 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6161, - "step": 99 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8096, - "step": 100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9918, - "step": 101 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.344, - "step": 102 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1607, - "step": 103 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4834, - "step": 104 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.704, - "step": 105 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1238, - "step": 106 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8066, - "step": 107 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9656, - "step": 108 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1979, - "step": 109 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2294, - "step": 110 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.066, - "step": 111 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7914, - "step": 112 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7344, - "step": 113 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6703, - "step": 114 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8817, - "step": 115 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.7733, - "step": 116 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.469, - "step": 117 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1304, - "step": 118 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.871, - "step": 119 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5353, - "step": 120 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9055, - "step": 121 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6142, - "step": 122 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0201, - "step": 123 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3805, - "step": 124 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6825, - "step": 125 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7166, - "step": 126 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7747, - "step": 127 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7695, - "step": 128 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7291, - "step": 129 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1296, - "step": 130 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5374, - "step": 131 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1854, - "step": 132 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.434, - "step": 133 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.438, - "step": 134 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3027, - "step": 135 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.382, - "step": 136 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9277, - "step": 137 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.223, - "step": 138 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3042, - "step": 139 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6361, - "step": 140 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3547, - "step": 141 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.7181, - "step": 142 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.7528, - "step": 143 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.4316, - "step": 144 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2219, - "step": 145 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7788, - "step": 146 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2749, - "step": 147 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2397, - "step": 148 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6243, - "step": 149 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.145, - "step": 150 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7951, - "step": 151 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1862, - "step": 152 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1305, - "step": 153 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5766, - "step": 154 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9232, - "step": 155 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9936, - "step": 156 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.9692, - "step": 157 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2772, - "step": 158 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.302, - "step": 159 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9931, - "step": 160 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9675, - "step": 161 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8536, - "step": 162 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6589, - "step": 163 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.932, - "step": 164 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0301, - "step": 165 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4861, - "step": 166 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1354, - "step": 167 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0717, - "step": 168 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9346, - "step": 169 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9373, - "step": 170 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8777, - "step": 171 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4193, - "step": 172 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6831, - "step": 173 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4175, - "step": 174 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3629, - "step": 175 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.118, - "step": 176 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.633, - "step": 177 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8355, - "step": 178 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4522, - "step": 179 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9272, - "step": 180 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4631, - "step": 181 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2987, - "step": 182 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1183, - "step": 183 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9976, - "step": 184 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0668, - "step": 185 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6291, - "step": 186 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5937, - "step": 187 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7382, - "step": 188 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7677, - "step": 189 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0293, - "step": 190 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6407, - "step": 191 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9508, - "step": 192 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.5053, - "step": 193 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5718, - "step": 194 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5211, - "step": 195 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9557, - "step": 196 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1609, - "step": 197 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8505, - "step": 198 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8278, - "step": 199 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8447, - "step": 200 - }, - { - "epoch": 0.0, - "eval_loss": 7.883856773376465, - "eval_runtime": 22.4254, - "eval_samples_per_second": 2.23, - "eval_steps_per_second": 1.115, - "step": 200 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 4.629522514343262, - "step": 200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3249, - "step": 201 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.352, - "step": 202 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2984, - "step": 203 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.2734, - "step": 204 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1, - "step": 205 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.448, - "step": 206 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2387, - "step": 207 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.861, - "step": 208 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.603, - "step": 209 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.29, - "step": 210 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2105, - "step": 211 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1949, - "step": 212 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0538, - "step": 213 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0343, - "step": 214 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7794, - "step": 215 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5532, - "step": 216 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2676, - "step": 217 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.566, - "step": 218 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0432, - "step": 219 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9391, - "step": 220 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.724, - "step": 221 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.229, - "step": 222 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3462, - "step": 223 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0752, - "step": 224 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1966, - "step": 225 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7279, - "step": 226 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8484, - "step": 227 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7291, - "step": 228 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2665, - "step": 229 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3551, - "step": 230 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7338, - "step": 231 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8407, - "step": 232 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3581, - "step": 233 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.441, - "step": 234 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0788, - "step": 235 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8404, - "step": 236 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4314, - "step": 237 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8426, - "step": 238 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.0205, - "step": 239 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4162, - "step": 240 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7515, - "step": 241 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1442, - "step": 242 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5868, - "step": 243 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6514, - "step": 244 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2683, - "step": 245 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.31, - "step": 246 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0161, - "step": 247 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.484, - "step": 248 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9726, - "step": 249 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0926, - "step": 250 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5279, - "step": 251 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0017, - "step": 252 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5684, - "step": 253 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3875, - "step": 254 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9489, - "step": 255 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.8948, - "step": 256 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0856, - "step": 257 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.599, - "step": 258 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1575, - "step": 259 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3701, - "step": 260 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.464, - "step": 261 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9193, - "step": 262 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5679, - "step": 263 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9424, - "step": 264 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6689, - "step": 265 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6475, - "step": 266 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4311, - "step": 267 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7426, - "step": 268 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5191, - "step": 269 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3059, - "step": 270 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0142, - "step": 271 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.4509, - "step": 272 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0831, - "step": 273 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6977, - "step": 274 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4236, - "step": 275 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2129, - "step": 276 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1394, - "step": 277 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.685, - "step": 278 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0275, - "step": 279 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.3215, - "step": 280 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6542, - "step": 281 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7614, - "step": 282 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2996, - "step": 283 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6275, - "step": 284 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8736, - "step": 285 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4667, - "step": 286 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8486, - "step": 287 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2125, - "step": 288 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4523, - "step": 289 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.551, - "step": 290 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.7158, - "step": 291 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5092, - "step": 292 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9169, - "step": 293 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5333, - "step": 294 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9949, - "step": 295 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.7189, - "step": 296 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2366, - "step": 297 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4745, - "step": 298 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2439, - "step": 299 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4176, - "step": 300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.9365, - "step": 301 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5309, - "step": 302 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2201, - "step": 303 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0312, - "step": 304 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4173, - "step": 305 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4856, - "step": 306 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5041, - "step": 307 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3597, - "step": 308 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8395, - "step": 309 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0776, - "step": 310 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7566, - "step": 311 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9767, - "step": 312 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3804, - "step": 313 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5327, - "step": 314 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5293, - "step": 315 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4531, - "step": 316 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3961, - "step": 317 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5669, - "step": 318 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8559, - "step": 319 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.117, - "step": 320 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4279, - "step": 321 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7977, - "step": 322 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.955, - "step": 323 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0164, - "step": 324 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.0495, - "step": 325 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2768, - "step": 326 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3162, - "step": 327 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.88, - "step": 328 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2157, - "step": 329 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8427, - "step": 330 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9729, - "step": 331 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1779, - "step": 332 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1302, - "step": 333 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7705, - "step": 334 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.523, - "step": 335 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9375, - "step": 336 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.1409, - "step": 337 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.633, - "step": 338 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6481, - "step": 339 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.933, - "step": 340 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9179, - "step": 341 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9332, - "step": 342 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6553, - "step": 343 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7412, - "step": 344 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.849, - "step": 345 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.7321, - "step": 346 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9717, - "step": 347 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3465, - "step": 348 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4535, - "step": 349 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2376, - "step": 350 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9025, - "step": 351 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.916, - "step": 352 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.3785, - "step": 353 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0576, - "step": 354 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5081, - "step": 355 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1303, - "step": 356 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3854, - "step": 357 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5553, - "step": 358 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9627, - "step": 359 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.402, - "step": 360 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3484, - "step": 361 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5428, - "step": 362 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9128, - "step": 363 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3934, - "step": 364 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4812, - "step": 365 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5395, - "step": 366 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6304, - "step": 367 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5626, - "step": 368 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5693, - "step": 369 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3458, - "step": 370 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6254, - "step": 371 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8706, - "step": 372 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6076, - "step": 373 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.2912, - "step": 374 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3326, - "step": 375 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3735, - "step": 376 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4916, - "step": 377 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5553, - "step": 378 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6241, - "step": 379 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6106, - "step": 380 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.266, - "step": 381 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7738, - "step": 382 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4988, - "step": 383 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2968, - "step": 384 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8512, - "step": 385 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0341, - "step": 386 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.898, - "step": 387 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.23, - "step": 388 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9608, - "step": 389 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.3679, - "step": 390 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.7074, - "step": 391 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9903, - "step": 392 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5845, - "step": 393 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6493, - "step": 394 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7962, - "step": 395 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4865, - "step": 396 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3418, - "step": 397 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3942, - "step": 398 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4715, - "step": 399 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2073, - "step": 400 - }, - { - "epoch": 0.0, - "eval_loss": 7.106412410736084, - "eval_runtime": 22.5667, - "eval_samples_per_second": 2.216, - "eval_steps_per_second": 1.108, - "step": 400 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 2.9128687667846678, - "step": 400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3984, - "step": 401 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7983, - "step": 402 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8589, - "step": 403 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9884, - "step": 404 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4427, - "step": 405 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0374, - "step": 406 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7999, - "step": 407 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2437, - "step": 408 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6902, - "step": 409 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.81, - "step": 410 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8979, - "step": 411 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0211, - "step": 412 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3945, - "step": 413 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5807, - "step": 414 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1433, - "step": 415 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9466, - "step": 416 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6276, - "step": 417 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4945, - "step": 418 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6215, - "step": 419 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.3919, - "step": 420 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7915, - "step": 421 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3284, - "step": 422 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8723, - "step": 423 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0149, - "step": 424 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.979, - "step": 425 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9175, - "step": 426 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4994, - "step": 427 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9791, - "step": 428 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1156, - "step": 429 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5813, - "step": 430 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1882, - "step": 431 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9956, - "step": 432 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6189, - "step": 433 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9624, - "step": 434 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5387, - "step": 435 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4605, - "step": 436 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.474, - "step": 437 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0497, - "step": 438 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5705, - "step": 439 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.275, - "step": 440 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9638, - "step": 441 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4857, - "step": 442 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3067, - "step": 443 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8152, - "step": 444 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1668, - "step": 445 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5293, - "step": 446 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3981, - "step": 447 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4787, - "step": 448 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5981, - "step": 449 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3569, - "step": 450 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4088, - "step": 451 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3677, - "step": 452 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4686, - "step": 453 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3552, - "step": 454 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7931, - "step": 455 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9285, - "step": 456 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0554, - "step": 457 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7277, - "step": 458 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2474, - "step": 459 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9274, - "step": 460 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2558, - "step": 461 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7547, - "step": 462 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1264, - "step": 463 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2124, - "step": 464 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8751, - "step": 465 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7317, - "step": 466 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3697, - "step": 467 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0021, - "step": 468 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3761, - "step": 469 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2291, - "step": 470 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7968, - "step": 471 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9454, - "step": 472 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0194, - "step": 473 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5048, - "step": 474 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6837, - "step": 475 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1066, - "step": 476 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3501, - "step": 477 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5071, - "step": 478 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1086, - "step": 479 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7269, - "step": 480 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5419, - "step": 481 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2974, - "step": 482 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.1433, - "step": 483 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0869, - "step": 484 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.032, - "step": 485 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0946, - "step": 486 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7162, - "step": 487 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0406, - "step": 488 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9048, - "step": 489 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2231, - "step": 490 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.6524, - "step": 491 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1151, - "step": 492 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.591, - "step": 493 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1628, - "step": 494 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0757, - "step": 495 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3471, - "step": 496 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9385, - "step": 497 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9362, - "step": 498 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2252, - "step": 499 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.359, - "step": 500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0497, - "step": 501 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0484, - "step": 502 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5773, - "step": 503 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.39, - "step": 504 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5923, - "step": 505 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2, - "step": 506 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5536, - "step": 507 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.8958, - "step": 508 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7763, - "step": 509 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2045, - "step": 510 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4219, - "step": 511 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6305, - "step": 512 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4243, - "step": 513 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7842, - "step": 514 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8769, - "step": 515 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8903, - "step": 516 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0489, - "step": 517 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1314, - "step": 518 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5973, - "step": 519 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8022, - "step": 520 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3539, - "step": 521 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.222, - "step": 522 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5403, - "step": 523 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1323, - "step": 524 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7813, - "step": 525 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4982, - "step": 526 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2426, - "step": 527 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0142, - "step": 528 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8996, - "step": 529 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8671, - "step": 530 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4139, - "step": 531 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9478, - "step": 532 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7062, - "step": 533 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0098, - "step": 534 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9195, - "step": 535 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0255, - "step": 536 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6291, - "step": 537 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3245, - "step": 538 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6382, - "step": 539 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.8076, - "step": 540 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6725, - "step": 541 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0563, - "step": 542 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.6178, - "step": 543 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7974, - "step": 544 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7535, - "step": 545 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4948, - "step": 546 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8941, - "step": 547 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6496, - "step": 548 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9084, - "step": 549 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.65, - "step": 550 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7673, - "step": 551 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2221, - "step": 552 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.14, - "step": 553 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.6747, - "step": 554 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8009, - "step": 555 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7307, - "step": 556 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0143, - "step": 557 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8098, - "step": 558 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.026, - "step": 559 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4572, - "step": 560 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7913, - "step": 561 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9962, - "step": 562 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.767, - "step": 563 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9497, - "step": 564 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9626, - "step": 565 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2536, - "step": 566 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0421, - "step": 567 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8177, - "step": 568 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9241, - "step": 569 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0162, - "step": 570 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3368, - "step": 571 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7515, - "step": 572 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6389, - "step": 573 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.662, - "step": 574 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8097, - "step": 575 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9346, - "step": 576 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3154, - "step": 577 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7724, - "step": 578 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3685, - "step": 579 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.2775, - "step": 580 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.106, - "step": 581 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4733, - "step": 582 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2334, - "step": 583 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9478, - "step": 584 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0013, - "step": 585 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7242, - "step": 586 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.922, - "step": 587 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1418, - "step": 588 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4472, - "step": 589 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4785, - "step": 590 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.783, - "step": 591 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0706, - "step": 592 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4136, - "step": 593 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5969, - "step": 594 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5157, - "step": 595 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5658, - "step": 596 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4647, - "step": 597 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2028, - "step": 598 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6913, - "step": 599 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7239, - "step": 600 - }, - { - "epoch": 0.0, - "eval_loss": 7.012163162231445, - "eval_runtime": 22.5807, - "eval_samples_per_second": 2.214, - "eval_steps_per_second": 1.107, - "step": 600 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.24488224029541, - "step": 600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5253, - "step": 601 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0392, - "step": 602 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.447, - "step": 603 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9441, - "step": 604 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1874, - "step": 605 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7817, - "step": 606 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0348, - "step": 607 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5593, - "step": 608 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9361, - "step": 609 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3534, - "step": 610 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.476, - "step": 611 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0937, - "step": 612 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3027, - "step": 613 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5586, - "step": 614 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3796, - "step": 615 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.676, - "step": 616 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5321, - "step": 617 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0059, - "step": 618 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6139, - "step": 619 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.2391, - "step": 620 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0636, - "step": 621 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0895, - "step": 622 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.62, - "step": 623 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0469, - "step": 624 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2173, - "step": 625 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9432, - "step": 626 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3928, - "step": 627 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0959, - "step": 628 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.1197, - "step": 629 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4277, - "step": 630 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.418, - "step": 631 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8687, - "step": 632 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0156, - "step": 633 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.573, - "step": 634 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.112, - "step": 635 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8954, - "step": 636 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.36, - "step": 637 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.924, - "step": 638 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4625, - "step": 639 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2023, - "step": 640 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0685, - "step": 641 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5304, - "step": 642 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4456, - "step": 643 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7271, - "step": 644 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6011, - "step": 645 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.895, - "step": 646 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.864, - "step": 647 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3452, - "step": 648 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8978, - "step": 649 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2253, - "step": 650 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2813, - "step": 651 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7248, - "step": 652 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4283, - "step": 653 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4304, - "step": 654 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3893, - "step": 655 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1115, - "step": 656 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5892, - "step": 657 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6572, - "step": 658 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.925, - "step": 659 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4431, - "step": 660 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7711, - "step": 661 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9439, - "step": 662 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3781, - "step": 663 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5573, - "step": 664 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.4476, - "step": 665 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0057, - "step": 666 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2702, - "step": 667 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5717, - "step": 668 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2242, - "step": 669 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1, - "step": 670 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0517, - "step": 671 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6543, - "step": 672 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1138, - "step": 673 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.461, - "step": 674 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7094, - "step": 675 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.521, - "step": 676 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7116, - "step": 677 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6343, - "step": 678 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3762, - "step": 679 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3603, - "step": 680 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7144, - "step": 681 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4545, - "step": 682 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8188, - "step": 683 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7965, - "step": 684 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4675, - "step": 685 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0436, - "step": 686 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1219, - "step": 687 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4517, - "step": 688 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8476, - "step": 689 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9284, - "step": 690 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7405, - "step": 691 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7142, - "step": 692 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3979, - "step": 693 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.3285, - "step": 694 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3418, - "step": 695 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4472, - "step": 696 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7355, - "step": 697 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7982, - "step": 698 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4516, - "step": 699 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2532, - "step": 700 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9959, - "step": 701 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0418, - "step": 702 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.7767, - "step": 703 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.774, - "step": 704 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8912, - "step": 705 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2178, - "step": 706 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6197, - "step": 707 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4755, - "step": 708 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8276, - "step": 709 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2925, - "step": 710 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3887, - "step": 711 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1465, - "step": 712 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5806, - "step": 713 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3063, - "step": 714 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6066, - "step": 715 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1536, - "step": 716 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5582, - "step": 717 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0353, - "step": 718 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6415, - "step": 719 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8291, - "step": 720 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.7575, - "step": 721 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9141, - "step": 722 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5217, - "step": 723 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4549, - "step": 724 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8112, - "step": 725 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2729, - "step": 726 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8515, - "step": 727 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9712, - "step": 728 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.097, - "step": 729 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0208, - "step": 730 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1377, - "step": 731 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4019, - "step": 732 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9869, - "step": 733 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2954, - "step": 734 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4144, - "step": 735 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8053, - "step": 736 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8891, - "step": 737 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.812, - "step": 738 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2657, - "step": 739 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3747, - "step": 740 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0364, - "step": 741 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8845, - "step": 742 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.887, - "step": 743 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0706, - "step": 744 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6619, - "step": 745 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2941, - "step": 746 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9192, - "step": 747 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9947, - "step": 748 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6376, - "step": 749 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0358, - "step": 750 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4578, - "step": 751 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7784, - "step": 752 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.632, - "step": 753 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8649, - "step": 754 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7951, - "step": 755 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3841, - "step": 756 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4558, - "step": 757 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7638, - "step": 758 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9413, - "step": 759 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0916, - "step": 760 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1351, - "step": 761 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6078, - "step": 762 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7982, - "step": 763 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6132, - "step": 764 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.551, - "step": 765 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3301, - "step": 766 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4888, - "step": 767 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1476, - "step": 768 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4244, - "step": 769 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6025, - "step": 770 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.102, - "step": 771 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.017, - "step": 772 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4101, - "step": 773 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1741, - "step": 774 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1256, - "step": 775 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5164, - "step": 776 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6959, - "step": 777 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7666, - "step": 778 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4336, - "step": 779 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 11.8478, - "step": 780 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8382, - "step": 781 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1792, - "step": 782 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4424, - "step": 783 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.345, - "step": 784 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6887, - "step": 785 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9867, - "step": 786 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6152, - "step": 787 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7283, - "step": 788 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0157, - "step": 789 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6044, - "step": 790 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4132, - "step": 791 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.735, - "step": 792 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3631, - "step": 793 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2308, - "step": 794 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2184, - "step": 795 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4661, - "step": 796 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9637, - "step": 797 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4178, - "step": 798 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5909, - "step": 799 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1482, - "step": 800 - }, - { - "epoch": 0.01, - "eval_loss": 7.355834484100342, - "eval_runtime": 22.6252, - "eval_samples_per_second": 2.21, - "eval_steps_per_second": 1.105, - "step": 800 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 5.191131496429444, - "step": 800 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.0427, - "step": 801 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2669, - "step": 802 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8026, - "step": 803 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4949, - "step": 804 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4491, - "step": 805 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0383, - "step": 806 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1213, - "step": 807 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5158, - "step": 808 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5648, - "step": 809 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9932, - "step": 810 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6441, - "step": 811 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8661, - "step": 812 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3609, - "step": 813 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6828, - "step": 814 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9693, - "step": 815 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3733, - "step": 816 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6286, - "step": 817 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4349, - "step": 818 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6706, - "step": 819 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3089, - "step": 820 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2394, - "step": 821 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.963, - "step": 822 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6564, - "step": 823 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.997, - "step": 824 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9261, - "step": 825 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1421, - "step": 826 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2335, - "step": 827 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3432, - "step": 828 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0154, - "step": 829 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5135, - "step": 830 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6226, - "step": 831 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1672, - "step": 832 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0853, - "step": 833 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1213, - "step": 834 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7815, - "step": 835 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8916, - "step": 836 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6464, - "step": 837 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3307, - "step": 838 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8165, - "step": 839 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.886, - "step": 840 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4781, - "step": 841 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8351, - "step": 842 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.358, - "step": 843 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6501, - "step": 844 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0864, - "step": 845 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2922, - "step": 846 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.9847, - "step": 847 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2558, - "step": 848 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0195, - "step": 849 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.996, - "step": 850 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5705, - "step": 851 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4136, - "step": 852 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6302, - "step": 853 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8761, - "step": 854 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4995, - "step": 855 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4762, - "step": 856 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5749, - "step": 857 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0273, - "step": 858 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8258, - "step": 859 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1836, - "step": 860 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5493, - "step": 861 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1891, - "step": 862 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7392, - "step": 863 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1655, - "step": 864 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5218, - "step": 865 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3759, - "step": 866 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2497, - "step": 867 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5901, - "step": 868 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0624, - "step": 869 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.2452, - "step": 870 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5649, - "step": 871 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0826, - "step": 872 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2703, - "step": 873 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9088, - "step": 874 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3875, - "step": 875 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2511, - "step": 876 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4065, - "step": 877 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.175, - "step": 878 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8358, - "step": 879 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3208, - "step": 880 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2049, - "step": 881 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8251, - "step": 882 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4262, - "step": 883 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2227, - "step": 884 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1062, - "step": 885 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9417, - "step": 886 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3315, - "step": 887 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0012, - "step": 888 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6386, - "step": 889 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0377, - "step": 890 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6707, - "step": 891 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4955, - "step": 892 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7343, - "step": 893 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8305, - "step": 894 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7016, - "step": 895 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7149, - "step": 896 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5649, - "step": 897 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.815, - "step": 898 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6135, - "step": 899 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8776, - "step": 900 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7288, - "step": 901 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8019, - "step": 902 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0921, - "step": 903 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.824, - "step": 904 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7151, - "step": 905 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5054, - "step": 906 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8095, - "step": 907 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3218, - "step": 908 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9993, - "step": 909 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4433, - "step": 910 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5863, - "step": 911 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.505, - "step": 912 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9734, - "step": 913 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1792, - "step": 914 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4574, - "step": 915 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2787, - "step": 916 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8201, - "step": 917 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2537, - "step": 918 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1387, - "step": 919 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7161, - "step": 920 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2207, - "step": 921 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7953, - "step": 922 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9949, - "step": 923 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9173, - "step": 924 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7903, - "step": 925 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4784, - "step": 926 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2264, - "step": 927 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.566, - "step": 928 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0686, - "step": 929 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.791, - "step": 930 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8393, - "step": 931 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4387, - "step": 932 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2374, - "step": 933 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9598, - "step": 934 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1597, - "step": 935 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0403, - "step": 936 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3301, - "step": 937 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.072, - "step": 938 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4788, - "step": 939 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0656, - "step": 940 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9647, - "step": 941 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1168, - "step": 942 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0293, - "step": 943 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3622, - "step": 944 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8957, - "step": 945 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4, - "step": 946 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6626, - "step": 947 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8212, - "step": 948 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8638, - "step": 949 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6406, - "step": 950 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7069, - "step": 951 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1384, - "step": 952 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.612, - "step": 953 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7201, - "step": 954 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3532, - "step": 955 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1266, - "step": 956 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6192, - "step": 957 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.826, - "step": 958 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9338, - "step": 959 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4487, - "step": 960 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.872, - "step": 961 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8601, - "step": 962 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7401, - "step": 963 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5412, - "step": 964 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2501, - "step": 965 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6837, - "step": 966 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6494, - "step": 967 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.604, - "step": 968 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.837, - "step": 969 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3957, - "step": 970 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3281, - "step": 971 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8264, - "step": 972 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6554, - "step": 973 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5768, - "step": 974 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4187, - "step": 975 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8479, - "step": 976 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9849, - "step": 977 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6471, - "step": 978 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8041, - "step": 979 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8876, - "step": 980 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6423, - "step": 981 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5329, - "step": 982 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2801, - "step": 983 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1699, - "step": 984 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6469, - "step": 985 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6766, - "step": 986 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7538, - "step": 987 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9606, - "step": 988 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0713, - "step": 989 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4965, - "step": 990 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3408, - "step": 991 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4007, - "step": 992 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8921, - "step": 993 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8681, - "step": 994 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.8867, - "step": 995 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.467, - "step": 996 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7895, - "step": 997 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0523, - "step": 998 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4032, - "step": 999 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7719, - "step": 1000 - }, - { - "epoch": 0.01, - "eval_loss": 6.766034126281738, - "eval_runtime": 22.4042, - "eval_samples_per_second": 2.232, - "eval_steps_per_second": 1.116, - "step": 1000 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 4.338861379623413, - "step": 1000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0285, - "step": 1001 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4571, - "step": 1002 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7721, - "step": 1003 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5108, - "step": 1004 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3813, - "step": 1005 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7963, - "step": 1006 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1101, - "step": 1007 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.021, - "step": 1008 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5916, - "step": 1009 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8813, - "step": 1010 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1193, - "step": 1011 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5092, - "step": 1012 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8569, - "step": 1013 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.119, - "step": 1014 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3247, - "step": 1015 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2358, - "step": 1016 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2795, - "step": 1017 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3466, - "step": 1018 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5443, - "step": 1019 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7296, - "step": 1020 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0412, - "step": 1021 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4829, - "step": 1022 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7901, - "step": 1023 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8077, - "step": 1024 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4887, - "step": 1025 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3095, - "step": 1026 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3235, - "step": 1027 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6315, - "step": 1028 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4294, - "step": 1029 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8457, - "step": 1030 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7583, - "step": 1031 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3129, - "step": 1032 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1832, - "step": 1033 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1764, - "step": 1034 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0101, - "step": 1035 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6524, - "step": 1036 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2825, - "step": 1037 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2262, - "step": 1038 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2533, - "step": 1039 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8794, - "step": 1040 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7901, - "step": 1041 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8351, - "step": 1042 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5888, - "step": 1043 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8932, - "step": 1044 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2999, - "step": 1045 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8396, - "step": 1046 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4209, - "step": 1047 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1524, - "step": 1048 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7784, - "step": 1049 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0179, - "step": 1050 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1153, - "step": 1051 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2149, - "step": 1052 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0117, - "step": 1053 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9693, - "step": 1054 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5656, - "step": 1055 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5, - "step": 1056 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.102, - "step": 1057 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3079, - "step": 1058 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5754, - "step": 1059 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6989, - "step": 1060 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9597, - "step": 1061 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3743, - "step": 1062 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8887, - "step": 1063 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3779, - "step": 1064 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5001, - "step": 1065 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4095, - "step": 1066 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5073, - "step": 1067 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1331, - "step": 1068 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.323, - "step": 1069 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6116, - "step": 1070 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1212, - "step": 1071 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0951, - "step": 1072 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2463, - "step": 1073 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4488, - "step": 1074 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.279, - "step": 1075 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5728, - "step": 1076 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1362, - "step": 1077 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6648, - "step": 1078 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.427, - "step": 1079 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8145, - "step": 1080 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5308, - "step": 1081 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.974, - "step": 1082 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1965, - "step": 1083 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8749, - "step": 1084 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7352, - "step": 1085 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7934, - "step": 1086 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6003, - "step": 1087 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5775, - "step": 1088 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.519, - "step": 1089 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7403, - "step": 1090 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8145, - "step": 1091 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5776, - "step": 1092 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3753, - "step": 1093 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9586, - "step": 1094 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7263, - "step": 1095 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7034, - "step": 1096 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0579, - "step": 1097 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8419, - "step": 1098 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0751, - "step": 1099 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6438, - "step": 1100 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8744, - "step": 1101 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4992, - "step": 1102 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8094, - "step": 1103 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.162, - "step": 1104 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8351, - "step": 1105 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8845, - "step": 1106 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1894, - "step": 1107 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.8333, - "step": 1108 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4226, - "step": 1109 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0712, - "step": 1110 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9981, - "step": 1111 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5885, - "step": 1112 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1915, - "step": 1113 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8003, - "step": 1114 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5566, - "step": 1115 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4085, - "step": 1116 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0793, - "step": 1117 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0909, - "step": 1118 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2273, - "step": 1119 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8273, - "step": 1120 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0231, - "step": 1121 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7229, - "step": 1122 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4479, - "step": 1123 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2178, - "step": 1124 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9038, - "step": 1125 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2653, - "step": 1126 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2974, - "step": 1127 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3003, - "step": 1128 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7853, - "step": 1129 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9143, - "step": 1130 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2573, - "step": 1131 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7091, - "step": 1132 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3372, - "step": 1133 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4165, - "step": 1134 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4422, - "step": 1135 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7693, - "step": 1136 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7802, - "step": 1137 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7263, - "step": 1138 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6749, - "step": 1139 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9459, - "step": 1140 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9697, - "step": 1141 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4506, - "step": 1142 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5099, - "step": 1143 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1475, - "step": 1144 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3769, - "step": 1145 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2035, - "step": 1146 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6017, - "step": 1147 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.463, - "step": 1148 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3844, - "step": 1149 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5306, - "step": 1150 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5502, - "step": 1151 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7394, - "step": 1152 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5626, - "step": 1153 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1618, - "step": 1154 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5174, - "step": 1155 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1038, - "step": 1156 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3789, - "step": 1157 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2985, - "step": 1158 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4763, - "step": 1159 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5071, - "step": 1160 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0827, - "step": 1161 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7349, - "step": 1162 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.798, - "step": 1163 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3176, - "step": 1164 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8114, - "step": 1165 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3379, - "step": 1166 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1157, - "step": 1167 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4675, - "step": 1168 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2721, - "step": 1169 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0603, - "step": 1170 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6358, - "step": 1171 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0865, - "step": 1172 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.709, - "step": 1173 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7705, - "step": 1174 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7677, - "step": 1175 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2418, - "step": 1176 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7114, - "step": 1177 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1165, - "step": 1178 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9654, - "step": 1179 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0672, - "step": 1180 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1738, - "step": 1181 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7604, - "step": 1182 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8426, - "step": 1183 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0231, - "step": 1184 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2938, - "step": 1185 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.783, - "step": 1186 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3328, - "step": 1187 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.321, - "step": 1188 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6368, - "step": 1189 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.101, - "step": 1190 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6777, - "step": 1191 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0831, - "step": 1192 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5853, - "step": 1193 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7923, - "step": 1194 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3734, - "step": 1195 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4268, - "step": 1196 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6796, - "step": 1197 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9028, - "step": 1198 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3716, - "step": 1199 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6761, - "step": 1200 - }, - { - "epoch": 0.01, - "eval_loss": 6.9188361167907715, - "eval_runtime": 22.426, - "eval_samples_per_second": 2.23, - "eval_steps_per_second": 1.115, - "step": 1200 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 3.3686839294433595, - "step": 1200 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8855, - "step": 1201 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8206, - "step": 1202 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4401, - "step": 1203 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2366, - "step": 1204 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9885, - "step": 1205 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5444, - "step": 1206 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4259, - "step": 1207 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5369, - "step": 1208 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0839, - "step": 1209 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7622, - "step": 1210 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8979, - "step": 1211 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5508, - "step": 1212 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6439, - "step": 1213 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6249, - "step": 1214 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.495, - "step": 1215 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0642, - "step": 1216 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8997, - "step": 1217 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6275, - "step": 1218 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3317, - "step": 1219 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4635, - "step": 1220 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5197, - "step": 1221 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5928, - "step": 1222 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2363, - "step": 1223 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0266, - "step": 1224 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3356, - "step": 1225 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7927, - "step": 1226 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6952, - "step": 1227 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8878, - "step": 1228 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7472, - "step": 1229 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6454, - "step": 1230 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4972, - "step": 1231 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3347, - "step": 1232 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1631, - "step": 1233 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4708, - "step": 1234 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5697, - "step": 1235 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8218, - "step": 1236 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.269, - "step": 1237 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4165, - "step": 1238 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3653, - "step": 1239 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0152, - "step": 1240 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9157, - "step": 1241 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4086, - "step": 1242 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2493, - "step": 1243 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8279, - "step": 1244 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6649, - "step": 1245 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4405, - "step": 1246 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1992, - "step": 1247 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2055, - "step": 1248 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4395, - "step": 1249 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2475, - "step": 1250 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8205, - "step": 1251 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1053, - "step": 1252 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7494, - "step": 1253 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7387, - "step": 1254 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8983, - "step": 1255 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5614, - "step": 1256 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7617, - "step": 1257 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2445, - "step": 1258 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3043, - "step": 1259 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4214, - "step": 1260 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1384, - "step": 1261 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3914, - "step": 1262 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3287, - "step": 1263 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2174, - "step": 1264 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4397, - "step": 1265 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6875, - "step": 1266 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4512, - "step": 1267 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2834, - "step": 1268 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7651, - "step": 1269 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9263, - "step": 1270 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6721, - "step": 1271 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9178, - "step": 1272 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7967, - "step": 1273 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5242, - "step": 1274 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7794, - "step": 1275 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4256, - "step": 1276 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5788, - "step": 1277 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7586, - "step": 1278 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.964, - "step": 1279 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0749, - "step": 1280 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6248, - "step": 1281 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2465, - "step": 1282 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1591, - "step": 1283 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4328, - "step": 1284 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.534, - "step": 1285 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.523, - "step": 1286 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5672, - "step": 1287 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9162, - "step": 1288 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1089, - "step": 1289 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3287, - "step": 1290 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2499, - "step": 1291 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9645, - "step": 1292 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3903, - "step": 1293 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5322, - "step": 1294 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2211, - "step": 1295 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2788, - "step": 1296 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1862, - "step": 1297 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2678, - "step": 1298 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5399, - "step": 1299 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7935, - "step": 1300 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0391, - "step": 1301 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1049, - "step": 1302 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.365, - "step": 1303 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8809, - "step": 1304 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2335, - "step": 1305 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.5135, - "step": 1306 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2378, - "step": 1307 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9265, - "step": 1308 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.641, - "step": 1309 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9822, - "step": 1310 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3369, - "step": 1311 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3735, - "step": 1312 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2618, - "step": 1313 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6854, - "step": 1314 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3748, - "step": 1315 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9206, - "step": 1316 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1969, - "step": 1317 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1245, - "step": 1318 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9977, - "step": 1319 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5319, - "step": 1320 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4431, - "step": 1321 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7264, - "step": 1322 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.05, - "step": 1323 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3118, - "step": 1324 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4575, - "step": 1325 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.593, - "step": 1326 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0061, - "step": 1327 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2481, - "step": 1328 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8017, - "step": 1329 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8617, - "step": 1330 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7036, - "step": 1331 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0091, - "step": 1332 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9687, - "step": 1333 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3925, - "step": 1334 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1127, - "step": 1335 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8163, - "step": 1336 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0639, - "step": 1337 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8929, - "step": 1338 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5011, - "step": 1339 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.033, - "step": 1340 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0526, - "step": 1341 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4428, - "step": 1342 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3477, - "step": 1343 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.881, - "step": 1344 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5276, - "step": 1345 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4183, - "step": 1346 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4943, - "step": 1347 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9187, - "step": 1348 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1003, - "step": 1349 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1187, - "step": 1350 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8081, - "step": 1351 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4695, - "step": 1352 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5761, - "step": 1353 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9635, - "step": 1354 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2133, - "step": 1355 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2611, - "step": 1356 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6885, - "step": 1357 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1157, - "step": 1358 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4421, - "step": 1359 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2128, - "step": 1360 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6978, - "step": 1361 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9804, - "step": 1362 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3426, - "step": 1363 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2676, - "step": 1364 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.325, - "step": 1365 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1263, - "step": 1366 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7481, - "step": 1367 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6891, - "step": 1368 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8568, - "step": 1369 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9893, - "step": 1370 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0302, - "step": 1371 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3204, - "step": 1372 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9008, - "step": 1373 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2624, - "step": 1374 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6234, - "step": 1375 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2286, - "step": 1376 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3426, - "step": 1377 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1962, - "step": 1378 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3142, - "step": 1379 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.414, - "step": 1380 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0191, - "step": 1381 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4953, - "step": 1382 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6694, - "step": 1383 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8611, - "step": 1384 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.86, - "step": 1385 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6519, - "step": 1386 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.394, - "step": 1387 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2117, - "step": 1388 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9924, - "step": 1389 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.58, - "step": 1390 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4415, - "step": 1391 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7196, - "step": 1392 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7388, - "step": 1393 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4784, - "step": 1394 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.496, - "step": 1395 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8706, - "step": 1396 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1858, - "step": 1397 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9038, - "step": 1398 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4852, - "step": 1399 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2717, - "step": 1400 - }, - { - "epoch": 0.01, - "eval_loss": 6.97923469543457, - "eval_runtime": 22.472, - "eval_samples_per_second": 2.225, - "eval_steps_per_second": 1.112, - "step": 1400 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.657382688522339, - "step": 1400 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.843, - "step": 1401 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5611, - "step": 1402 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2434, - "step": 1403 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3136, - "step": 1404 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.686, - "step": 1405 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6365, - "step": 1406 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1811, - "step": 1407 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7537, - "step": 1408 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2949, - "step": 1409 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4827, - "step": 1410 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0965, - "step": 1411 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.007, - "step": 1412 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2861, - "step": 1413 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1774, - "step": 1414 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7777, - "step": 1415 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0259, - "step": 1416 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9024, - "step": 1417 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4786, - "step": 1418 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5873, - "step": 1419 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2744, - "step": 1420 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9484, - "step": 1421 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2093, - "step": 1422 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3394, - "step": 1423 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1247, - "step": 1424 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0691, - "step": 1425 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.559, - "step": 1426 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1518, - "step": 1427 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4143, - "step": 1428 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0287, - "step": 1429 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8112, - "step": 1430 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2625, - "step": 1431 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3528, - "step": 1432 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2715, - "step": 1433 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7849, - "step": 1434 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2002, - "step": 1435 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0658, - "step": 1436 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0671, - "step": 1437 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2577, - "step": 1438 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.803, - "step": 1439 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2974, - "step": 1440 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0897, - "step": 1441 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0805, - "step": 1442 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7681, - "step": 1443 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6565, - "step": 1444 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0174, - "step": 1445 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8507, - "step": 1446 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2105, - "step": 1447 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.572, - "step": 1448 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2904, - "step": 1449 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4623, - "step": 1450 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4774, - "step": 1451 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1277, - "step": 1452 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6204, - "step": 1453 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3219, - "step": 1454 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2517, - "step": 1455 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3026, - "step": 1456 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4016, - "step": 1457 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5256, - "step": 1458 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9316, - "step": 1459 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.631, - "step": 1460 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2888, - "step": 1461 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5511, - "step": 1462 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.9799, - "step": 1463 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6982, - "step": 1464 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4923, - "step": 1465 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8329, - "step": 1466 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2733, - "step": 1467 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8221, - "step": 1468 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.363, - "step": 1469 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6348, - "step": 1470 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3319, - "step": 1471 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6768, - "step": 1472 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1985, - "step": 1473 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6109, - "step": 1474 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.974, - "step": 1475 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8902, - "step": 1476 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6762, - "step": 1477 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8541, - "step": 1478 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3867, - "step": 1479 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9624, - "step": 1480 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8768, - "step": 1481 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7598, - "step": 1482 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6522, - "step": 1483 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8156, - "step": 1484 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3791, - "step": 1485 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2178, - "step": 1486 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8448, - "step": 1487 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5377, - "step": 1488 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7407, - "step": 1489 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7636, - "step": 1490 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4325, - "step": 1491 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8966, - "step": 1492 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0626, - "step": 1493 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.255, - "step": 1494 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2802, - "step": 1495 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.894, - "step": 1496 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6482, - "step": 1497 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8903, - "step": 1498 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8672, - "step": 1499 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6079, - "step": 1500 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6217, - "step": 1501 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2361, - "step": 1502 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3962, - "step": 1503 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0036, - "step": 1504 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5926, - "step": 1505 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.114, - "step": 1506 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4419, - "step": 1507 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7838, - "step": 1508 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6635, - "step": 1509 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2906, - "step": 1510 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4186, - "step": 1511 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4783, - "step": 1512 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1226, - "step": 1513 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2458, - "step": 1514 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5302, - "step": 1515 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1515, - "step": 1516 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4182, - "step": 1517 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8248, - "step": 1518 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2349, - "step": 1519 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9314, - "step": 1520 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1161, - "step": 1521 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4183, - "step": 1522 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4454, - "step": 1523 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5588, - "step": 1524 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8026, - "step": 1525 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7695, - "step": 1526 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3636, - "step": 1527 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2776, - "step": 1528 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5386, - "step": 1529 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.521, - "step": 1530 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8388, - "step": 1531 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3561, - "step": 1532 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9606, - "step": 1533 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9438, - "step": 1534 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7665, - "step": 1535 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5826, - "step": 1536 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.0798, - "step": 1537 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8545, - "step": 1538 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.302, - "step": 1539 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1092, - "step": 1540 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5021, - "step": 1541 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9384, - "step": 1542 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8761, - "step": 1543 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3316, - "step": 1544 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2051, - "step": 1545 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7907, - "step": 1546 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2534, - "step": 1547 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2274, - "step": 1548 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9226, - "step": 1549 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2502, - "step": 1550 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2703, - "step": 1551 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4359, - "step": 1552 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.128, - "step": 1553 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3147, - "step": 1554 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.026, - "step": 1555 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9393, - "step": 1556 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7753, - "step": 1557 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9049, - "step": 1558 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0538, - "step": 1559 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8691, - "step": 1560 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9377, - "step": 1561 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8385, - "step": 1562 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.939, - "step": 1563 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.727, - "step": 1564 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7866, - "step": 1565 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2439, - "step": 1566 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9607, - "step": 1567 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3505, - "step": 1568 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7729, - "step": 1569 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4669, - "step": 1570 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8178, - "step": 1571 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2173, - "step": 1572 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2136, - "step": 1573 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2888, - "step": 1574 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0386, - "step": 1575 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9041, - "step": 1576 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7544, - "step": 1577 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.3229, - "step": 1578 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4203, - "step": 1579 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.497, - "step": 1580 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8253, - "step": 1581 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0801, - "step": 1582 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1585, - "step": 1583 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6965, - "step": 1584 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.498, - "step": 1585 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8697, - "step": 1586 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2663, - "step": 1587 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7004, - "step": 1588 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6561, - "step": 1589 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.785, - "step": 1590 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5708, - "step": 1591 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.326, - "step": 1592 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2974, - "step": 1593 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1408, - "step": 1594 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6526, - "step": 1595 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4116, - "step": 1596 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0484, - "step": 1597 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3162, - "step": 1598 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3806, - "step": 1599 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0251, - "step": 1600 - }, - { - "epoch": 0.01, - "eval_loss": 6.617897987365723, - "eval_runtime": 22.4646, - "eval_samples_per_second": 2.226, - "eval_steps_per_second": 1.113, - "step": 1600 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.160770101547241, - "step": 1600 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9137, - "step": 1601 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2188, - "step": 1602 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7688, - "step": 1603 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9799, - "step": 1604 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5429, - "step": 1605 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8559, - "step": 1606 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3985, - "step": 1607 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9139, - "step": 1608 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3303, - "step": 1609 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5168, - "step": 1610 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5194, - "step": 1611 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9557, - "step": 1612 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7102, - "step": 1613 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8961, - "step": 1614 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6123, - "step": 1615 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7808, - "step": 1616 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4919, - "step": 1617 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0723, - "step": 1618 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2931, - "step": 1619 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8478, - "step": 1620 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7126, - "step": 1621 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6622, - "step": 1622 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3861, - "step": 1623 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9856, - "step": 1624 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5076, - "step": 1625 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4168, - "step": 1626 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2825, - "step": 1627 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7497, - "step": 1628 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5672, - "step": 1629 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4095, - "step": 1630 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.649, - "step": 1631 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3418, - "step": 1632 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1337, - "step": 1633 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3829, - "step": 1634 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0294, - "step": 1635 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2164, - "step": 1636 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3294, - "step": 1637 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7047, - "step": 1638 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5622, - "step": 1639 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4873, - "step": 1640 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6641, - "step": 1641 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3943, - "step": 1642 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2266, - "step": 1643 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0471, - "step": 1644 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5658, - "step": 1645 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6489, - "step": 1646 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3851, - "step": 1647 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7921, - "step": 1648 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4581, - "step": 1649 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1407, - "step": 1650 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2919, - "step": 1651 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4061, - "step": 1652 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3081, - "step": 1653 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0527, - "step": 1654 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8729, - "step": 1655 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.029, - "step": 1656 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6632, - "step": 1657 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7047, - "step": 1658 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6288, - "step": 1659 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8466, - "step": 1660 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7726, - "step": 1661 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.245, - "step": 1662 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0538, - "step": 1663 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3611, - "step": 1664 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.011, - "step": 1665 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6491, - "step": 1666 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3409, - "step": 1667 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.262, - "step": 1668 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.781, - "step": 1669 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8025, - "step": 1670 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7529, - "step": 1671 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2322, - "step": 1672 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4527, - "step": 1673 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9457, - "step": 1674 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.859, - "step": 1675 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9207, - "step": 1676 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5378, - "step": 1677 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6585, - "step": 1678 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9523, - "step": 1679 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1348, - "step": 1680 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9582, - "step": 1681 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.416, - "step": 1682 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8214, - "step": 1683 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8833, - "step": 1684 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1021, - "step": 1685 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7392, - "step": 1686 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2616, - "step": 1687 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.325, - "step": 1688 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3814, - "step": 1689 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2816, - "step": 1690 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.033, - "step": 1691 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5742, - "step": 1692 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0841, - "step": 1693 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2888, - "step": 1694 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9263, - "step": 1695 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7552, - "step": 1696 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4163, - "step": 1697 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6207, - "step": 1698 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.938, - "step": 1699 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2925, - "step": 1700 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0401, - "step": 1701 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1536, - "step": 1702 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2754, - "step": 1703 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6765, - "step": 1704 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.63, - "step": 1705 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6902, - "step": 1706 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6434, - "step": 1707 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2283, - "step": 1708 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9377, - "step": 1709 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.371, - "step": 1710 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6569, - "step": 1711 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2221, - "step": 1712 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5375, - "step": 1713 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2189, - "step": 1714 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.769, - "step": 1715 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0089, - "step": 1716 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6402, - "step": 1717 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4812, - "step": 1718 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9754, - "step": 1719 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8435, - "step": 1720 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9424, - "step": 1721 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5465, - "step": 1722 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.477, - "step": 1723 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2254, - "step": 1724 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3663, - "step": 1725 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.663, - "step": 1726 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6951, - "step": 1727 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.856, - "step": 1728 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0652, - "step": 1729 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6929, - "step": 1730 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8069, - "step": 1731 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.02, - "step": 1732 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0782, - "step": 1733 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0236, - "step": 1734 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2769, - "step": 1735 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7126, - "step": 1736 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2746, - "step": 1737 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8726, - "step": 1738 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7962, - "step": 1739 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7602, - "step": 1740 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.3105, - "step": 1741 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0771, - "step": 1742 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4738, - "step": 1743 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2952, - "step": 1744 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2692, - "step": 1745 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7201, - "step": 1746 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2978, - "step": 1747 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.518, - "step": 1748 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.659, - "step": 1749 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9101, - "step": 1750 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8397, - "step": 1751 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0451, - "step": 1752 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7696, - "step": 1753 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1377, - "step": 1754 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2621, - "step": 1755 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2143, - "step": 1756 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4378, - "step": 1757 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8631, - "step": 1758 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.019, - "step": 1759 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7475, - "step": 1760 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6228, - "step": 1761 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0703, - "step": 1762 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3426, - "step": 1763 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0842, - "step": 1764 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1032, - "step": 1765 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6321, - "step": 1766 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7804, - "step": 1767 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6566, - "step": 1768 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4985, - "step": 1769 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1129, - "step": 1770 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8081, - "step": 1771 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8441, - "step": 1772 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4054, - "step": 1773 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6334, - "step": 1774 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4323, - "step": 1775 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.18, - "step": 1776 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7573, - "step": 1777 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4642, - "step": 1778 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.038, - "step": 1779 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3785, - "step": 1780 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5088, - "step": 1781 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0139, - "step": 1782 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0999, - "step": 1783 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3224, - "step": 1784 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.634, - "step": 1785 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1264, - "step": 1786 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.317, - "step": 1787 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1279, - "step": 1788 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2364, - "step": 1789 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0627, - "step": 1790 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2471, - "step": 1791 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8407, - "step": 1792 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7083, - "step": 1793 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4522, - "step": 1794 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0308, - "step": 1795 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6915, - "step": 1796 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.082, - "step": 1797 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7956, - "step": 1798 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7007, - "step": 1799 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9197, - "step": 1800 - }, - { - "epoch": 0.01, - "eval_loss": 6.619495868682861, - "eval_runtime": 22.4352, - "eval_samples_per_second": 2.229, - "eval_steps_per_second": 1.114, - "step": 1800 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.238778591156006, - "step": 1800 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1537, - "step": 1801 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.684, - "step": 1802 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7862, - "step": 1803 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3518, - "step": 1804 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1795, - "step": 1805 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0054, - "step": 1806 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8997, - "step": 1807 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9002, - "step": 1808 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2805, - "step": 1809 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1203, - "step": 1810 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0206, - "step": 1811 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0151, - "step": 1812 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3864, - "step": 1813 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1117, - "step": 1814 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8487, - "step": 1815 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.59, - "step": 1816 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1615, - "step": 1817 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7362, - "step": 1818 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2294, - "step": 1819 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5622, - "step": 1820 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5437, - "step": 1821 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.093, - "step": 1822 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0343, - "step": 1823 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4454, - "step": 1824 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5138, - "step": 1825 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5605, - "step": 1826 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.322, - "step": 1827 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6489, - "step": 1828 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.331, - "step": 1829 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6462, - "step": 1830 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.932, - "step": 1831 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9058, - "step": 1832 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3433, - "step": 1833 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4365, - "step": 1834 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3282, - "step": 1835 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.448, - "step": 1836 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5369, - "step": 1837 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.177, - "step": 1838 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3552, - "step": 1839 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4568, - "step": 1840 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0602, - "step": 1841 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7449, - "step": 1842 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2675, - "step": 1843 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0317, - "step": 1844 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4342, - "step": 1845 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8688, - "step": 1846 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3571, - "step": 1847 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3776, - "step": 1848 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2248, - "step": 1849 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6073, - "step": 1850 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8425, - "step": 1851 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5954, - "step": 1852 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4197, - "step": 1853 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8624, - "step": 1854 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9652, - "step": 1855 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7145, - "step": 1856 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5309, - "step": 1857 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4356, - "step": 1858 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6508, - "step": 1859 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0955, - "step": 1860 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6886, - "step": 1861 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7644, - "step": 1862 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5709, - "step": 1863 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6212, - "step": 1864 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6325, - "step": 1865 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6805, - "step": 1866 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1464, - "step": 1867 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9244, - "step": 1868 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.336, - "step": 1869 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8783, - "step": 1870 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8236, - "step": 1871 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.084, - "step": 1872 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9639, - "step": 1873 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4173, - "step": 1874 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0042, - "step": 1875 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2519, - "step": 1876 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4656, - "step": 1877 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5136, - "step": 1878 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3918, - "step": 1879 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9696, - "step": 1880 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9736, - "step": 1881 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6192, - "step": 1882 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3476, - "step": 1883 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3048, - "step": 1884 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1116, - "step": 1885 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.971, - "step": 1886 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0741, - "step": 1887 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1418, - "step": 1888 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3487, - "step": 1889 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.38, - "step": 1890 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6561, - "step": 1891 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5606, - "step": 1892 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8623, - "step": 1893 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2984, - "step": 1894 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6179, - "step": 1895 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8625, - "step": 1896 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8596, - "step": 1897 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7205, - "step": 1898 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6727, - "step": 1899 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.016, - "step": 1900 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9868, - "step": 1901 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.632, - "step": 1902 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5133, - "step": 1903 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7476, - "step": 1904 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4174, - "step": 1905 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6789, - "step": 1906 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4534, - "step": 1907 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3335, - "step": 1908 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7921, - "step": 1909 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9567, - "step": 1910 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.1739, - "step": 1911 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7514, - "step": 1912 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3858, - "step": 1913 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0462, - "step": 1914 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3817, - "step": 1915 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9739, - "step": 1916 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1122, - "step": 1917 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3361, - "step": 1918 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3184, - "step": 1919 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7342, - "step": 1920 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.375, - "step": 1921 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6841, - "step": 1922 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0773, - "step": 1923 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8916, - "step": 1924 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7176, - "step": 1925 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8841, - "step": 1926 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8345, - "step": 1927 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.561, - "step": 1928 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5392, - "step": 1929 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1627, - "step": 1930 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0657, - "step": 1931 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7385, - "step": 1932 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5533, - "step": 1933 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0925, - "step": 1934 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8752, - "step": 1935 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4039, - "step": 1936 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6472, - "step": 1937 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1819, - "step": 1938 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5919, - "step": 1939 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6527, - "step": 1940 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5188, - "step": 1941 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9856, - "step": 1942 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7038, - "step": 1943 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.911, - "step": 1944 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.497, - "step": 1945 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1804, - "step": 1946 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3949, - "step": 1947 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0433, - "step": 1948 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4706, - "step": 1949 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5896, - "step": 1950 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.557, - "step": 1951 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.34, - "step": 1952 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7865, - "step": 1953 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0797, - "step": 1954 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2896, - "step": 1955 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4096, - "step": 1956 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9538, - "step": 1957 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2778, - "step": 1958 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4968, - "step": 1959 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8328, - "step": 1960 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4597, - "step": 1961 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6776, - "step": 1962 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4861, - "step": 1963 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5831, - "step": 1964 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4585, - "step": 1965 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7898, - "step": 1966 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8714, - "step": 1967 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.752, - "step": 1968 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9024, - "step": 1969 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.058, - "step": 1970 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1745, - "step": 1971 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2162, - "step": 1972 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2668, - "step": 1973 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3307, - "step": 1974 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3285, - "step": 1975 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1353, - "step": 1976 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8069, - "step": 1977 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6885, - "step": 1978 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5946, - "step": 1979 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6828, - "step": 1980 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6516, - "step": 1981 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.261, - "step": 1982 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.524, - "step": 1983 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.585, - "step": 1984 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8883, - "step": 1985 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.689, - "step": 1986 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1083, - "step": 1987 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1606, - "step": 1988 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9243, - "step": 1989 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6597, - "step": 1990 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2849, - "step": 1991 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3715, - "step": 1992 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7262, - "step": 1993 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6862, - "step": 1994 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5412, - "step": 1995 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7483, - "step": 1996 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3391, - "step": 1997 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2642, - "step": 1998 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1519, - "step": 1999 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7098, - "step": 2000 - }, - { - "epoch": 0.02, - "eval_loss": 6.762476921081543, - "eval_runtime": 22.4899, - "eval_samples_per_second": 2.223, - "eval_steps_per_second": 1.112, - "step": 2000 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.4606559085845947, - "step": 2000 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8099, - "step": 2001 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0567, - "step": 2002 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2981, - "step": 2003 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2668, - "step": 2004 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.139, - "step": 2005 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.903, - "step": 2006 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2182, - "step": 2007 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2347, - "step": 2008 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8383, - "step": 2009 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0211, - "step": 2010 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2572, - "step": 2011 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2877, - "step": 2012 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3577, - "step": 2013 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2022, - "step": 2014 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2722, - "step": 2015 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0552, - "step": 2016 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.9857, - "step": 2017 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0519, - "step": 2018 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7118, - "step": 2019 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4465, - "step": 2020 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3009, - "step": 2021 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3614, - "step": 2022 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3493, - "step": 2023 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.34, - "step": 2024 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0416, - "step": 2025 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.686, - "step": 2026 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6021, - "step": 2027 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4161, - "step": 2028 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.0029, - "step": 2029 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.8579, - "step": 2030 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0247, - "step": 2031 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4184, - "step": 2032 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4962, - "step": 2033 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5137, - "step": 2034 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.6692, - "step": 2035 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7161, - "step": 2036 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.617, - "step": 2037 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.413, - "step": 2038 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3056, - "step": 2039 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9441, - "step": 2040 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9943, - "step": 2041 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5703, - "step": 2042 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1881, - "step": 2043 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5763, - "step": 2044 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6389, - "step": 2045 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1717, - "step": 2046 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5482, - "step": 2047 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9469, - "step": 2048 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7685, - "step": 2049 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1381, - "step": 2050 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6961, - "step": 2051 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6425, - "step": 2052 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5354, - "step": 2053 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2404, - "step": 2054 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1556, - "step": 2055 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7133, - "step": 2056 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8166, - "step": 2057 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5071, - "step": 2058 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5429, - "step": 2059 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0367, - "step": 2060 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5386, - "step": 2061 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.5899, - "step": 2062 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2968, - "step": 2063 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9951, - "step": 2064 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8608, - "step": 2065 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4735, - "step": 2066 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5612, - "step": 2067 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7461, - "step": 2068 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5887, - "step": 2069 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3426, - "step": 2070 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5589, - "step": 2071 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.498, - "step": 2072 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1306, - "step": 2073 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.3492, - "step": 2074 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2311, - "step": 2075 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8798, - "step": 2076 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6799, - "step": 2077 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5011, - "step": 2078 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8892, - "step": 2079 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6449, - "step": 2080 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9117, - "step": 2081 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1157, - "step": 2082 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.196, - "step": 2083 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.9364, - "step": 2084 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3618, - "step": 2085 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3755, - "step": 2086 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4564, - "step": 2087 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4912, - "step": 2088 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.113, - "step": 2089 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0588, - "step": 2090 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.668, - "step": 2091 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.08, - "step": 2092 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2042, - "step": 2093 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4134, - "step": 2094 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0456, - "step": 2095 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2245, - "step": 2096 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4936, - "step": 2097 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5158, - "step": 2098 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7269, - "step": 2099 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7077, - "step": 2100 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6008, - "step": 2101 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4652, - "step": 2102 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.918, - "step": 2103 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.5819, - "step": 2104 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7764, - "step": 2105 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.0525, - "step": 2106 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5359, - "step": 2107 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4925, - "step": 2108 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4857, - "step": 2109 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.9445, - "step": 2110 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8494, - "step": 2111 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1513, - "step": 2112 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2552, - "step": 2113 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7229, - "step": 2114 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8571, - "step": 2115 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5968, - "step": 2116 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8806, - "step": 2117 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4641, - "step": 2118 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6039, - "step": 2119 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1379, - "step": 2120 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6688, - "step": 2121 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.293, - "step": 2122 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5664, - "step": 2123 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0825, - "step": 2124 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9788, - "step": 2125 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.9641, - "step": 2126 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7799, - "step": 2127 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0619, - "step": 2128 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0022, - "step": 2129 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8022, - "step": 2130 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5301, - "step": 2131 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.681, - "step": 2132 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7362, - "step": 2133 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5462, - "step": 2134 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2356, - "step": 2135 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2259, - "step": 2136 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3646, - "step": 2137 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8765, - "step": 2138 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6487, - "step": 2139 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9622, - "step": 2140 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1761, - "step": 2141 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6922, - "step": 2142 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0371, - "step": 2143 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7869, - "step": 2144 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3725, - "step": 2145 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8894, - "step": 2146 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6083, - "step": 2147 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4451, - "step": 2148 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1149, - "step": 2149 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8058, - "step": 2150 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1308, - "step": 2151 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1447, - "step": 2152 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.208, - "step": 2153 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5193, - "step": 2154 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7729, - "step": 2155 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5019, - "step": 2156 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6092, - "step": 2157 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1853, - "step": 2158 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7, - "step": 2159 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1638, - "step": 2160 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.762, - "step": 2161 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7455, - "step": 2162 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9372, - "step": 2163 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4569, - "step": 2164 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6705, - "step": 2165 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1988, - "step": 2166 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2526, - "step": 2167 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9066, - "step": 2168 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1365, - "step": 2169 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3422, - "step": 2170 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2691, - "step": 2171 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9008, - "step": 2172 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2555, - "step": 2173 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0886, - "step": 2174 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0369, - "step": 2175 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5566, - "step": 2176 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2567, - "step": 2177 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0179, - "step": 2178 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5383, - "step": 2179 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4797, - "step": 2180 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0163, - "step": 2181 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2658, - "step": 2182 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1337, - "step": 2183 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3287, - "step": 2184 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7874, - "step": 2185 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7153, - "step": 2186 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7037, - "step": 2187 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4412, - "step": 2188 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3912, - "step": 2189 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.034, - "step": 2190 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4697, - "step": 2191 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6243, - "step": 2192 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1133, - "step": 2193 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.9005, - "step": 2194 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7386, - "step": 2195 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4169, - "step": 2196 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8139, - "step": 2197 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3012, - "step": 2198 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8223, - "step": 2199 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3757, - "step": 2200 - }, - { - "epoch": 0.02, - "eval_loss": 6.580160140991211, - "eval_runtime": 22.4971, - "eval_samples_per_second": 2.223, - "eval_steps_per_second": 1.111, - "step": 2200 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.755114164352417, - "step": 2200 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5282, - "step": 2201 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2478, - "step": 2202 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.916, - "step": 2203 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5069, - "step": 2204 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5952, - "step": 2205 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5059, - "step": 2206 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7434, - "step": 2207 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.625, - "step": 2208 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1674, - "step": 2209 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3937, - "step": 2210 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8783, - "step": 2211 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5263, - "step": 2212 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7887, - "step": 2213 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8911, - "step": 2214 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7211, - "step": 2215 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.089, - "step": 2216 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6373, - "step": 2217 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7728, - "step": 2218 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6957, - "step": 2219 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.43, - "step": 2220 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9673, - "step": 2221 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8942, - "step": 2222 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2893, - "step": 2223 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1505, - "step": 2224 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3702, - "step": 2225 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1731, - "step": 2226 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.997, - "step": 2227 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9531, - "step": 2228 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0748, - "step": 2229 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0642, - "step": 2230 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9469, - "step": 2231 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2265, - "step": 2232 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6461, - "step": 2233 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.064, - "step": 2234 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1414, - "step": 2235 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5375, - "step": 2236 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6348, - "step": 2237 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9975, - "step": 2238 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5242, - "step": 2239 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3179, - "step": 2240 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6054, - "step": 2241 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1832, - "step": 2242 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.0572, - "step": 2243 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2049, - "step": 2244 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6348, - "step": 2245 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.67, - "step": 2246 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.5627, - "step": 2247 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1851, - "step": 2248 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6792, - "step": 2249 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6344, - "step": 2250 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7603, - "step": 2251 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7737, - "step": 2252 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5323, - "step": 2253 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4059, - "step": 2254 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9343, - "step": 2255 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0156, - "step": 2256 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1851, - "step": 2257 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.44, - "step": 2258 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9079, - "step": 2259 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4982, - "step": 2260 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 12.3777, - "step": 2261 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.1265, - "step": 2262 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1428, - "step": 2263 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8072, - "step": 2264 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.911, - "step": 2265 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9453, - "step": 2266 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0168, - "step": 2267 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2098, - "step": 2268 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4417, - "step": 2269 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8449, - "step": 2270 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.394, - "step": 2271 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7642, - "step": 2272 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5555, - "step": 2273 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3576, - "step": 2274 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.386, - "step": 2275 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6677, - "step": 2276 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2385, - "step": 2277 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8074, - "step": 2278 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2963, - "step": 2279 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3612, - "step": 2280 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1837, - "step": 2281 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5882, - "step": 2282 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0968, - "step": 2283 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2376, - "step": 2284 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3835, - "step": 2285 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0143, - "step": 2286 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.36, - "step": 2287 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0121, - "step": 2288 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0144, - "step": 2289 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6807, - "step": 2290 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8854, - "step": 2291 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1727, - "step": 2292 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.533, - "step": 2293 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9793, - "step": 2294 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.538, - "step": 2295 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.145, - "step": 2296 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.163, - "step": 2297 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1045, - "step": 2298 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0209, - "step": 2299 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9728, - "step": 2300 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8902, - "step": 2301 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3075, - "step": 2302 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.2194, - "step": 2303 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7375, - "step": 2304 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3863, - "step": 2305 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1317, - "step": 2306 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1878, - "step": 2307 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6124, - "step": 2308 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8843, - "step": 2309 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3988, - "step": 2310 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3523, - "step": 2311 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5766, - "step": 2312 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9096, - "step": 2313 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9315, - "step": 2314 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4044, - "step": 2315 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6206, - "step": 2316 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2429, - "step": 2317 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0383, - "step": 2318 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4282, - "step": 2319 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8973, - "step": 2320 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1771, - "step": 2321 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.624, - "step": 2322 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5197, - "step": 2323 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7313, - "step": 2324 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8234, - "step": 2325 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1702, - "step": 2326 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.536, - "step": 2327 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1904, - "step": 2328 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2077, - "step": 2329 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.891, - "step": 2330 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6784, - "step": 2331 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6611, - "step": 2332 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3402, - "step": 2333 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 11.1523, - "step": 2334 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5547, - "step": 2335 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3485, - "step": 2336 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8289, - "step": 2337 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2558, - "step": 2338 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1794, - "step": 2339 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8782, - "step": 2340 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.415, - "step": 2341 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5257, - "step": 2342 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4751, - "step": 2343 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2259, - "step": 2344 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8681, - "step": 2345 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6307, - "step": 2346 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1487, - "step": 2347 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.3949, - "step": 2348 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6988, - "step": 2349 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1299, - "step": 2350 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9938, - "step": 2351 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4176, - "step": 2352 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0184, - "step": 2353 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2779, - "step": 2354 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0162, - "step": 2355 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2335, - "step": 2356 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5505, - "step": 2357 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6445, - "step": 2358 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6513, - "step": 2359 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8503, - "step": 2360 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1817, - "step": 2361 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4376, - "step": 2362 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1351, - "step": 2363 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7566, - "step": 2364 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.626, - "step": 2365 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5818, - "step": 2366 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3033, - "step": 2367 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9289, - "step": 2368 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0301, - "step": 2369 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4713, - "step": 2370 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0931, - "step": 2371 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5812, - "step": 2372 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2272, - "step": 2373 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5174, - "step": 2374 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1849, - "step": 2375 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7496, - "step": 2376 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.776, - "step": 2377 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3555, - "step": 2378 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.688, - "step": 2379 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0143, - "step": 2380 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7788, - "step": 2381 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7772, - "step": 2382 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6875, - "step": 2383 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9944, - "step": 2384 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8363, - "step": 2385 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7276, - "step": 2386 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4892, - "step": 2387 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1083, - "step": 2388 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.834, - "step": 2389 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8406, - "step": 2390 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1168, - "step": 2391 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2535, - "step": 2392 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9025, - "step": 2393 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4481, - "step": 2394 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7631, - "step": 2395 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2051, - "step": 2396 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7816, - "step": 2397 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2566, - "step": 2398 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1125, - "step": 2399 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5952, - "step": 2400 - }, - { - "epoch": 0.02, - "eval_loss": 6.616010665893555, - "eval_runtime": 22.4801, - "eval_samples_per_second": 2.224, - "eval_steps_per_second": 1.112, - "step": 2400 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.427501640319824, - "step": 2400 - } - ], - "max_steps": 30000, - "num_train_epochs": 1, - "total_flos": 4.010429591529062e+16, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoint-2400/training_args.bin b/checkpoint-2400/training_args.bin deleted file mode 100644 index 29a1b90871dc30211978426049e89f31e2b38f56..0000000000000000000000000000000000000000 --- a/checkpoint-2400/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2493c95326c359fb00f88976162bc7966690beaaca22964b91c1db649a04988f -size 6011 diff --git a/checkpoint-2600/README.md b/checkpoint-2600/README.md deleted file mode 100644 index 82793f73e61dbb024e11fc6697bba1622d4d0db6..0000000000000000000000000000000000000000 --- a/checkpoint-2600/README.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -library_name: peft ---- -## Training procedure - - -The following `bitsandbytes` quantization config was used during training: -- load_in_8bit: False -- load_in_4bit: True -- llm_int8_threshold: 6.0 -- llm_int8_skip_modules: None -- llm_int8_enable_fp32_cpu_offload: False -- llm_int8_has_fp16_weight: False -- bnb_4bit_quant_type: nf4 -- bnb_4bit_use_double_quant: True -- bnb_4bit_compute_dtype: bfloat16 -### Framework versions - - -- PEFT 0.4.0 diff --git a/checkpoint-2600/adapter_config.json b/checkpoint-2600/adapter_config.json deleted file mode 100644 index a2f0ea437da66b2120cc72d92fb46f999dfb8535..0000000000000000000000000000000000000000 --- a/checkpoint-2600/adapter_config.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "auto_mapping": null, - "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": true, - "layers_pattern": null, - "layers_to_transform": null, - "lora_alpha": 16.0, - "lora_dropout": 0.1, - "modules_to_save": null, - "peft_type": "LORA", - "r": 64, - "revision": null, - "target_modules": [ - "down_proj", - "up_proj", - "q_proj", - "gate_proj", - "o_proj", - "v_proj", - "k_proj" - ], - "task_type": "CAUSAL_LM" -} \ No newline at end of file diff --git a/checkpoint-2600/adapter_model.bin b/checkpoint-2600/adapter_model.bin deleted file mode 100644 index a68f59f31c8edc9eeb79a36e71b4ecac481e8e17..0000000000000000000000000000000000000000 --- a/checkpoint-2600/adapter_model.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a5c5c24c518432dc2a92978262de3604250bfd1be5847484ad75d2cbbe9a78b0 -size 871609293 diff --git a/checkpoint-2600/added_tokens.json b/checkpoint-2600/added_tokens.json deleted file mode 100644 index e41416ddd79948246ea2dced6800ea3cd531c424..0000000000000000000000000000000000000000 --- a/checkpoint-2600/added_tokens.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "[PAD]": 32000 -} diff --git a/checkpoint-2600/optimizer.pt b/checkpoint-2600/optimizer.pt deleted file mode 100644 index b4e2ff63f60f6bb919c734d73228fb4563cbf2a2..0000000000000000000000000000000000000000 --- a/checkpoint-2600/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e399286bddf29fae9880c0806ad5c3a545182e5671745d250d09887d0e6488e1 -size 873873439 diff --git a/checkpoint-2600/rng_state.pth b/checkpoint-2600/rng_state.pth deleted file mode 100644 index 3c948c35740f8fff78829acdb5a291bdb441fd5b..0000000000000000000000000000000000000000 --- a/checkpoint-2600/rng_state.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:93c31a76c4022e28a921261e573692176e89a3efea64621cc9f63e1fc4fe3e45 -size 14511 diff --git a/checkpoint-2600/scheduler.pt b/checkpoint-2600/scheduler.pt deleted file mode 100644 index 8467d11cb924ab7333c95fd21c844ab27d9f13f7..0000000000000000000000000000000000000000 --- a/checkpoint-2600/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e1d4f35a64d8e8e44f3eee69a5887f0a83a9d2c1424e925461278b8523e04e0f -size 627 diff --git a/checkpoint-2600/special_tokens_map.json b/checkpoint-2600/special_tokens_map.json deleted file mode 100644 index 3f58a5e115855c6ea3cec98accae196ad927222e..0000000000000000000000000000000000000000 --- a/checkpoint-2600/special_tokens_map.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "bos_token": "", - "eos_token": "", - "pad_token": "[PAD]", - "unk_token": "" -} diff --git a/checkpoint-2600/tokenizer.model b/checkpoint-2600/tokenizer.model deleted file mode 100644 index 6c00c742ce03c627d6cd5b795984876fa49fa899..0000000000000000000000000000000000000000 --- a/checkpoint-2600/tokenizer.model +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 -size 499723 diff --git a/checkpoint-2600/tokenizer_config.json b/checkpoint-2600/tokenizer_config.json deleted file mode 100644 index daaef2433dab9469de98b5b9a3848221ab25b7e8..0000000000000000000000000000000000000000 --- a/checkpoint-2600/tokenizer_config.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "bos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - }, - "clean_up_tokenization_spaces": false, - "eos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - }, - "legacy": null, - "model_max_length": 1000000000000000019884624838656, - "pad_token": null, - "padding_side": "right", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizer", - "unk_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - } -} diff --git a/checkpoint-2600/trainer_state.json b/checkpoint-2600/trainer_state.json deleted file mode 100644 index e1914a0c83fb145a3c2686f6f8aa7944b0d4513c..0000000000000000000000000000000000000000 --- a/checkpoint-2600/trainer_state.json +++ /dev/null @@ -1,15850 +0,0 @@ -{ - "best_metric": 6.580160140991211, - "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-2200", - "epoch": 0.019860973187686197, - "global_step": 2600, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0808, - "step": 1 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8773, - "step": 2 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1965, - "step": 3 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.118, - "step": 4 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1773, - "step": 5 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1165, - "step": 6 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2666, - "step": 7 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3704, - "step": 8 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9976, - "step": 9 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.985, - "step": 10 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.0541, - "step": 11 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.6228, - "step": 12 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.3651, - "step": 13 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0867, - "step": 14 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.4422, - "step": 15 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.7759, - "step": 16 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1446, - "step": 17 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0007, - "step": 18 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0894, - "step": 19 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2424, - "step": 20 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.1343, - "step": 21 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5354, - "step": 22 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1887, - "step": 23 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.6652, - "step": 24 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.964, - "step": 25 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1872, - "step": 26 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.4722, - "step": 27 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1462, - "step": 28 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0485, - "step": 29 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.148, - "step": 30 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7274, - "step": 31 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.6689, - "step": 32 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3384, - "step": 33 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.5354, - "step": 34 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.1976, - "step": 35 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.8593, - "step": 36 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.9302, - "step": 37 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5968, - "step": 38 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3169, - "step": 39 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.1793, - "step": 40 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.8457, - "step": 41 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5177, - "step": 42 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.003, - "step": 43 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.9928, - "step": 44 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.2574, - "step": 45 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3915, - "step": 46 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4105, - "step": 47 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.1184, - "step": 48 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.72, - "step": 49 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9628, - "step": 50 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2372, - "step": 51 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3733, - "step": 52 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8936, - "step": 53 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5353, - "step": 54 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.0754, - "step": 55 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6685, - "step": 56 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8984, - "step": 57 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2265, - "step": 58 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7696, - "step": 59 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7349, - "step": 60 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0221, - "step": 61 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.1901, - "step": 62 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.387, - "step": 63 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7323, - "step": 64 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2077, - "step": 65 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.3155, - "step": 66 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1656, - "step": 67 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 13.0828, - "step": 68 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5295, - "step": 69 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4575, - "step": 70 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.7654, - "step": 71 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.6263, - "step": 72 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 24.8238, - "step": 73 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.0654, - "step": 74 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 28.1046, - "step": 75 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.3232, - "step": 76 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 22.9712, - "step": 77 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 18.8529, - "step": 78 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.8356, - "step": 79 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 16.472, - "step": 80 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.2369, - "step": 81 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.0731, - "step": 82 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.8853, - "step": 83 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5438, - "step": 84 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2665, - "step": 85 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5484, - "step": 86 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7546, - "step": 87 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4309, - "step": 88 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5593, - "step": 89 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3822, - "step": 90 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.6315, - "step": 91 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6116, - "step": 92 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2288, - "step": 93 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0053, - "step": 94 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.359, - "step": 95 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9235, - "step": 96 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 31.9845, - "step": 97 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.1385, - "step": 98 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6161, - "step": 99 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8096, - "step": 100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9918, - "step": 101 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.344, - "step": 102 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1607, - "step": 103 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4834, - "step": 104 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.704, - "step": 105 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1238, - "step": 106 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8066, - "step": 107 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9656, - "step": 108 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1979, - "step": 109 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2294, - "step": 110 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.066, - "step": 111 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7914, - "step": 112 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7344, - "step": 113 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6703, - "step": 114 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8817, - "step": 115 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.7733, - "step": 116 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.469, - "step": 117 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1304, - "step": 118 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.871, - "step": 119 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5353, - "step": 120 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9055, - "step": 121 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6142, - "step": 122 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0201, - "step": 123 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3805, - "step": 124 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6825, - "step": 125 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7166, - "step": 126 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7747, - "step": 127 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7695, - "step": 128 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7291, - "step": 129 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1296, - "step": 130 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5374, - "step": 131 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1854, - "step": 132 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.434, - "step": 133 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.438, - "step": 134 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3027, - "step": 135 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.382, - "step": 136 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9277, - "step": 137 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.223, - "step": 138 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3042, - "step": 139 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6361, - "step": 140 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3547, - "step": 141 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.7181, - "step": 142 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.7528, - "step": 143 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.4316, - "step": 144 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2219, - "step": 145 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7788, - "step": 146 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2749, - "step": 147 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2397, - "step": 148 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6243, - "step": 149 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.145, - "step": 150 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7951, - "step": 151 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1862, - "step": 152 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1305, - "step": 153 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5766, - "step": 154 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9232, - "step": 155 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9936, - "step": 156 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.9692, - "step": 157 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2772, - "step": 158 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.302, - "step": 159 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9931, - "step": 160 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9675, - "step": 161 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8536, - "step": 162 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6589, - "step": 163 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.932, - "step": 164 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0301, - "step": 165 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4861, - "step": 166 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1354, - "step": 167 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0717, - "step": 168 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9346, - "step": 169 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9373, - "step": 170 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8777, - "step": 171 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4193, - "step": 172 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6831, - "step": 173 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4175, - "step": 174 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3629, - "step": 175 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.118, - "step": 176 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.633, - "step": 177 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8355, - "step": 178 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4522, - "step": 179 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9272, - "step": 180 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4631, - "step": 181 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2987, - "step": 182 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1183, - "step": 183 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9976, - "step": 184 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0668, - "step": 185 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6291, - "step": 186 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5937, - "step": 187 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7382, - "step": 188 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7677, - "step": 189 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0293, - "step": 190 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6407, - "step": 191 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9508, - "step": 192 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.5053, - "step": 193 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5718, - "step": 194 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5211, - "step": 195 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9557, - "step": 196 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1609, - "step": 197 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8505, - "step": 198 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8278, - "step": 199 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8447, - "step": 200 - }, - { - "epoch": 0.0, - "eval_loss": 7.883856773376465, - "eval_runtime": 22.4254, - "eval_samples_per_second": 2.23, - "eval_steps_per_second": 1.115, - "step": 200 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 4.629522514343262, - "step": 200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3249, - "step": 201 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.352, - "step": 202 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2984, - "step": 203 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.2734, - "step": 204 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1, - "step": 205 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.448, - "step": 206 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2387, - "step": 207 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.861, - "step": 208 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.603, - "step": 209 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.29, - "step": 210 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2105, - "step": 211 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1949, - "step": 212 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0538, - "step": 213 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0343, - "step": 214 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7794, - "step": 215 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5532, - "step": 216 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2676, - "step": 217 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.566, - "step": 218 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0432, - "step": 219 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9391, - "step": 220 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.724, - "step": 221 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.229, - "step": 222 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3462, - "step": 223 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0752, - "step": 224 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1966, - "step": 225 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7279, - "step": 226 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8484, - "step": 227 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7291, - "step": 228 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2665, - "step": 229 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3551, - "step": 230 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7338, - "step": 231 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8407, - "step": 232 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3581, - "step": 233 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.441, - "step": 234 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0788, - "step": 235 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8404, - "step": 236 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4314, - "step": 237 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8426, - "step": 238 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.0205, - "step": 239 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4162, - "step": 240 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7515, - "step": 241 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1442, - "step": 242 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5868, - "step": 243 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6514, - "step": 244 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2683, - "step": 245 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.31, - "step": 246 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0161, - "step": 247 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.484, - "step": 248 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9726, - "step": 249 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0926, - "step": 250 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5279, - "step": 251 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0017, - "step": 252 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5684, - "step": 253 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3875, - "step": 254 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9489, - "step": 255 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.8948, - "step": 256 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0856, - "step": 257 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.599, - "step": 258 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1575, - "step": 259 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3701, - "step": 260 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.464, - "step": 261 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9193, - "step": 262 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5679, - "step": 263 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9424, - "step": 264 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6689, - "step": 265 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6475, - "step": 266 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4311, - "step": 267 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7426, - "step": 268 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5191, - "step": 269 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3059, - "step": 270 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0142, - "step": 271 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.4509, - "step": 272 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0831, - "step": 273 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6977, - "step": 274 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4236, - "step": 275 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2129, - "step": 276 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1394, - "step": 277 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.685, - "step": 278 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0275, - "step": 279 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.3215, - "step": 280 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6542, - "step": 281 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7614, - "step": 282 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2996, - "step": 283 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6275, - "step": 284 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8736, - "step": 285 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4667, - "step": 286 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8486, - "step": 287 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2125, - "step": 288 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4523, - "step": 289 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.551, - "step": 290 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.7158, - "step": 291 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5092, - "step": 292 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9169, - "step": 293 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5333, - "step": 294 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9949, - "step": 295 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.7189, - "step": 296 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2366, - "step": 297 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4745, - "step": 298 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2439, - "step": 299 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4176, - "step": 300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.9365, - "step": 301 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5309, - "step": 302 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2201, - "step": 303 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0312, - "step": 304 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4173, - "step": 305 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4856, - "step": 306 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5041, - "step": 307 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3597, - "step": 308 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8395, - "step": 309 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0776, - "step": 310 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7566, - "step": 311 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9767, - "step": 312 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3804, - "step": 313 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5327, - "step": 314 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5293, - "step": 315 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4531, - "step": 316 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3961, - "step": 317 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5669, - "step": 318 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8559, - "step": 319 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.117, - "step": 320 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4279, - "step": 321 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7977, - "step": 322 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.955, - "step": 323 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0164, - "step": 324 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.0495, - "step": 325 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2768, - "step": 326 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3162, - "step": 327 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.88, - "step": 328 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2157, - "step": 329 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8427, - "step": 330 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9729, - "step": 331 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1779, - "step": 332 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1302, - "step": 333 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7705, - "step": 334 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.523, - "step": 335 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9375, - "step": 336 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.1409, - "step": 337 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.633, - "step": 338 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6481, - "step": 339 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.933, - "step": 340 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9179, - "step": 341 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9332, - "step": 342 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6553, - "step": 343 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7412, - "step": 344 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.849, - "step": 345 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.7321, - "step": 346 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9717, - "step": 347 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3465, - "step": 348 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4535, - "step": 349 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2376, - "step": 350 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9025, - "step": 351 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.916, - "step": 352 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.3785, - "step": 353 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0576, - "step": 354 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5081, - "step": 355 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1303, - "step": 356 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3854, - "step": 357 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5553, - "step": 358 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9627, - "step": 359 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.402, - "step": 360 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3484, - "step": 361 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5428, - "step": 362 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9128, - "step": 363 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3934, - "step": 364 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4812, - "step": 365 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5395, - "step": 366 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6304, - "step": 367 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5626, - "step": 368 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5693, - "step": 369 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3458, - "step": 370 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6254, - "step": 371 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8706, - "step": 372 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6076, - "step": 373 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.2912, - "step": 374 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3326, - "step": 375 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3735, - "step": 376 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4916, - "step": 377 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5553, - "step": 378 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6241, - "step": 379 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6106, - "step": 380 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.266, - "step": 381 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7738, - "step": 382 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4988, - "step": 383 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2968, - "step": 384 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8512, - "step": 385 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0341, - "step": 386 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.898, - "step": 387 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.23, - "step": 388 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9608, - "step": 389 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.3679, - "step": 390 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.7074, - "step": 391 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9903, - "step": 392 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5845, - "step": 393 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6493, - "step": 394 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7962, - "step": 395 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4865, - "step": 396 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3418, - "step": 397 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3942, - "step": 398 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4715, - "step": 399 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2073, - "step": 400 - }, - { - "epoch": 0.0, - "eval_loss": 7.106412410736084, - "eval_runtime": 22.5667, - "eval_samples_per_second": 2.216, - "eval_steps_per_second": 1.108, - "step": 400 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 2.9128687667846678, - "step": 400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3984, - "step": 401 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7983, - "step": 402 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8589, - "step": 403 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9884, - "step": 404 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4427, - "step": 405 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0374, - "step": 406 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7999, - "step": 407 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2437, - "step": 408 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6902, - "step": 409 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.81, - "step": 410 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8979, - "step": 411 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0211, - "step": 412 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3945, - "step": 413 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5807, - "step": 414 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1433, - "step": 415 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9466, - "step": 416 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6276, - "step": 417 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4945, - "step": 418 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6215, - "step": 419 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.3919, - "step": 420 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7915, - "step": 421 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3284, - "step": 422 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8723, - "step": 423 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0149, - "step": 424 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.979, - "step": 425 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9175, - "step": 426 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4994, - "step": 427 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9791, - "step": 428 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1156, - "step": 429 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5813, - "step": 430 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1882, - "step": 431 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9956, - "step": 432 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6189, - "step": 433 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9624, - "step": 434 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5387, - "step": 435 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4605, - "step": 436 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.474, - "step": 437 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0497, - "step": 438 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5705, - "step": 439 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.275, - "step": 440 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9638, - "step": 441 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4857, - "step": 442 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3067, - "step": 443 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8152, - "step": 444 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1668, - "step": 445 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5293, - "step": 446 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3981, - "step": 447 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4787, - "step": 448 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5981, - "step": 449 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3569, - "step": 450 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4088, - "step": 451 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3677, - "step": 452 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4686, - "step": 453 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3552, - "step": 454 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7931, - "step": 455 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9285, - "step": 456 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0554, - "step": 457 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7277, - "step": 458 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2474, - "step": 459 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9274, - "step": 460 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2558, - "step": 461 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7547, - "step": 462 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1264, - "step": 463 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2124, - "step": 464 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8751, - "step": 465 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7317, - "step": 466 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3697, - "step": 467 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0021, - "step": 468 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3761, - "step": 469 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2291, - "step": 470 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7968, - "step": 471 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9454, - "step": 472 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0194, - "step": 473 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5048, - "step": 474 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6837, - "step": 475 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1066, - "step": 476 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3501, - "step": 477 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5071, - "step": 478 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1086, - "step": 479 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7269, - "step": 480 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5419, - "step": 481 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2974, - "step": 482 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.1433, - "step": 483 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0869, - "step": 484 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.032, - "step": 485 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0946, - "step": 486 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7162, - "step": 487 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0406, - "step": 488 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9048, - "step": 489 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2231, - "step": 490 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.6524, - "step": 491 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1151, - "step": 492 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.591, - "step": 493 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1628, - "step": 494 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0757, - "step": 495 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3471, - "step": 496 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9385, - "step": 497 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9362, - "step": 498 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2252, - "step": 499 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.359, - "step": 500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0497, - "step": 501 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0484, - "step": 502 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5773, - "step": 503 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.39, - "step": 504 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5923, - "step": 505 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2, - "step": 506 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5536, - "step": 507 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.8958, - "step": 508 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7763, - "step": 509 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2045, - "step": 510 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4219, - "step": 511 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6305, - "step": 512 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4243, - "step": 513 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7842, - "step": 514 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8769, - "step": 515 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8903, - "step": 516 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0489, - "step": 517 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1314, - "step": 518 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5973, - "step": 519 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8022, - "step": 520 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3539, - "step": 521 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.222, - "step": 522 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5403, - "step": 523 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1323, - "step": 524 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7813, - "step": 525 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4982, - "step": 526 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2426, - "step": 527 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0142, - "step": 528 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8996, - "step": 529 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8671, - "step": 530 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4139, - "step": 531 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9478, - "step": 532 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7062, - "step": 533 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0098, - "step": 534 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9195, - "step": 535 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0255, - "step": 536 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6291, - "step": 537 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3245, - "step": 538 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6382, - "step": 539 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.8076, - "step": 540 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6725, - "step": 541 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0563, - "step": 542 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.6178, - "step": 543 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7974, - "step": 544 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7535, - "step": 545 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4948, - "step": 546 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8941, - "step": 547 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6496, - "step": 548 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9084, - "step": 549 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.65, - "step": 550 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7673, - "step": 551 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2221, - "step": 552 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.14, - "step": 553 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.6747, - "step": 554 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8009, - "step": 555 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7307, - "step": 556 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0143, - "step": 557 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8098, - "step": 558 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.026, - "step": 559 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4572, - "step": 560 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7913, - "step": 561 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9962, - "step": 562 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.767, - "step": 563 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9497, - "step": 564 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9626, - "step": 565 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2536, - "step": 566 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0421, - "step": 567 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8177, - "step": 568 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9241, - "step": 569 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0162, - "step": 570 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3368, - "step": 571 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7515, - "step": 572 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6389, - "step": 573 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.662, - "step": 574 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8097, - "step": 575 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9346, - "step": 576 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3154, - "step": 577 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7724, - "step": 578 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3685, - "step": 579 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.2775, - "step": 580 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.106, - "step": 581 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4733, - "step": 582 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2334, - "step": 583 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9478, - "step": 584 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0013, - "step": 585 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7242, - "step": 586 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.922, - "step": 587 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1418, - "step": 588 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4472, - "step": 589 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4785, - "step": 590 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.783, - "step": 591 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0706, - "step": 592 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4136, - "step": 593 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5969, - "step": 594 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5157, - "step": 595 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5658, - "step": 596 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4647, - "step": 597 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2028, - "step": 598 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6913, - "step": 599 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7239, - "step": 600 - }, - { - "epoch": 0.0, - "eval_loss": 7.012163162231445, - "eval_runtime": 22.5807, - "eval_samples_per_second": 2.214, - "eval_steps_per_second": 1.107, - "step": 600 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.24488224029541, - "step": 600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5253, - "step": 601 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0392, - "step": 602 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.447, - "step": 603 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9441, - "step": 604 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1874, - "step": 605 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7817, - "step": 606 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0348, - "step": 607 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5593, - "step": 608 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9361, - "step": 609 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3534, - "step": 610 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.476, - "step": 611 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0937, - "step": 612 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3027, - "step": 613 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5586, - "step": 614 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3796, - "step": 615 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.676, - "step": 616 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5321, - "step": 617 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0059, - "step": 618 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6139, - "step": 619 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.2391, - "step": 620 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0636, - "step": 621 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0895, - "step": 622 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.62, - "step": 623 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0469, - "step": 624 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2173, - "step": 625 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9432, - "step": 626 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3928, - "step": 627 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0959, - "step": 628 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.1197, - "step": 629 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4277, - "step": 630 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.418, - "step": 631 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8687, - "step": 632 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0156, - "step": 633 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.573, - "step": 634 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.112, - "step": 635 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8954, - "step": 636 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.36, - "step": 637 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.924, - "step": 638 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4625, - "step": 639 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2023, - "step": 640 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0685, - "step": 641 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5304, - "step": 642 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4456, - "step": 643 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7271, - "step": 644 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6011, - "step": 645 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.895, - "step": 646 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.864, - "step": 647 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3452, - "step": 648 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8978, - "step": 649 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2253, - "step": 650 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2813, - "step": 651 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7248, - "step": 652 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4283, - "step": 653 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4304, - "step": 654 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3893, - "step": 655 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1115, - "step": 656 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5892, - "step": 657 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6572, - "step": 658 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.925, - "step": 659 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4431, - "step": 660 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7711, - "step": 661 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9439, - "step": 662 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3781, - "step": 663 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5573, - "step": 664 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.4476, - "step": 665 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0057, - "step": 666 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2702, - "step": 667 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5717, - "step": 668 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2242, - "step": 669 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1, - "step": 670 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0517, - "step": 671 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6543, - "step": 672 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1138, - "step": 673 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.461, - "step": 674 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7094, - "step": 675 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.521, - "step": 676 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7116, - "step": 677 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6343, - "step": 678 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3762, - "step": 679 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3603, - "step": 680 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7144, - "step": 681 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4545, - "step": 682 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8188, - "step": 683 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7965, - "step": 684 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4675, - "step": 685 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0436, - "step": 686 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1219, - "step": 687 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4517, - "step": 688 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8476, - "step": 689 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9284, - "step": 690 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7405, - "step": 691 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7142, - "step": 692 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3979, - "step": 693 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.3285, - "step": 694 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3418, - "step": 695 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4472, - "step": 696 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7355, - "step": 697 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7982, - "step": 698 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4516, - "step": 699 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2532, - "step": 700 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9959, - "step": 701 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0418, - "step": 702 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.7767, - "step": 703 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.774, - "step": 704 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8912, - "step": 705 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2178, - "step": 706 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6197, - "step": 707 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4755, - "step": 708 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8276, - "step": 709 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2925, - "step": 710 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3887, - "step": 711 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1465, - "step": 712 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5806, - "step": 713 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3063, - "step": 714 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6066, - "step": 715 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1536, - "step": 716 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5582, - "step": 717 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0353, - "step": 718 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6415, - "step": 719 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8291, - "step": 720 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.7575, - "step": 721 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9141, - "step": 722 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5217, - "step": 723 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4549, - "step": 724 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8112, - "step": 725 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2729, - "step": 726 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8515, - "step": 727 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9712, - "step": 728 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.097, - "step": 729 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0208, - "step": 730 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1377, - "step": 731 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4019, - "step": 732 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9869, - "step": 733 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2954, - "step": 734 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4144, - "step": 735 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8053, - "step": 736 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8891, - "step": 737 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.812, - "step": 738 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2657, - "step": 739 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3747, - "step": 740 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0364, - "step": 741 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8845, - "step": 742 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.887, - "step": 743 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0706, - "step": 744 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6619, - "step": 745 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2941, - "step": 746 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9192, - "step": 747 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9947, - "step": 748 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6376, - "step": 749 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0358, - "step": 750 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4578, - "step": 751 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7784, - "step": 752 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.632, - "step": 753 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8649, - "step": 754 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7951, - "step": 755 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3841, - "step": 756 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4558, - "step": 757 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7638, - "step": 758 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9413, - "step": 759 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0916, - "step": 760 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1351, - "step": 761 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6078, - "step": 762 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7982, - "step": 763 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6132, - "step": 764 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.551, - "step": 765 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3301, - "step": 766 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4888, - "step": 767 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1476, - "step": 768 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4244, - "step": 769 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6025, - "step": 770 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.102, - "step": 771 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.017, - "step": 772 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4101, - "step": 773 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1741, - "step": 774 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1256, - "step": 775 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5164, - "step": 776 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6959, - "step": 777 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7666, - "step": 778 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4336, - "step": 779 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 11.8478, - "step": 780 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8382, - "step": 781 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1792, - "step": 782 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4424, - "step": 783 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.345, - "step": 784 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6887, - "step": 785 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9867, - "step": 786 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6152, - "step": 787 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7283, - "step": 788 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0157, - "step": 789 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6044, - "step": 790 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4132, - "step": 791 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.735, - "step": 792 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3631, - "step": 793 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2308, - "step": 794 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2184, - "step": 795 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4661, - "step": 796 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9637, - "step": 797 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4178, - "step": 798 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5909, - "step": 799 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1482, - "step": 800 - }, - { - "epoch": 0.01, - "eval_loss": 7.355834484100342, - "eval_runtime": 22.6252, - "eval_samples_per_second": 2.21, - "eval_steps_per_second": 1.105, - "step": 800 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 5.191131496429444, - "step": 800 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.0427, - "step": 801 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2669, - "step": 802 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8026, - "step": 803 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4949, - "step": 804 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4491, - "step": 805 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0383, - "step": 806 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1213, - "step": 807 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5158, - "step": 808 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5648, - "step": 809 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9932, - "step": 810 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6441, - "step": 811 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8661, - "step": 812 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3609, - "step": 813 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6828, - "step": 814 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9693, - "step": 815 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3733, - "step": 816 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6286, - "step": 817 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4349, - "step": 818 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6706, - "step": 819 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3089, - "step": 820 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2394, - "step": 821 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.963, - "step": 822 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6564, - "step": 823 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.997, - "step": 824 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9261, - "step": 825 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1421, - "step": 826 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2335, - "step": 827 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3432, - "step": 828 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0154, - "step": 829 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5135, - "step": 830 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6226, - "step": 831 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1672, - "step": 832 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0853, - "step": 833 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1213, - "step": 834 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7815, - "step": 835 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8916, - "step": 836 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6464, - "step": 837 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3307, - "step": 838 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8165, - "step": 839 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.886, - "step": 840 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4781, - "step": 841 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8351, - "step": 842 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.358, - "step": 843 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6501, - "step": 844 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0864, - "step": 845 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2922, - "step": 846 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.9847, - "step": 847 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2558, - "step": 848 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0195, - "step": 849 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.996, - "step": 850 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5705, - "step": 851 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4136, - "step": 852 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6302, - "step": 853 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8761, - "step": 854 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4995, - "step": 855 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4762, - "step": 856 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5749, - "step": 857 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0273, - "step": 858 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8258, - "step": 859 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1836, - "step": 860 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5493, - "step": 861 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1891, - "step": 862 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7392, - "step": 863 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1655, - "step": 864 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5218, - "step": 865 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3759, - "step": 866 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2497, - "step": 867 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5901, - "step": 868 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0624, - "step": 869 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.2452, - "step": 870 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5649, - "step": 871 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0826, - "step": 872 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2703, - "step": 873 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9088, - "step": 874 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3875, - "step": 875 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2511, - "step": 876 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4065, - "step": 877 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.175, - "step": 878 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8358, - "step": 879 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3208, - "step": 880 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2049, - "step": 881 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8251, - "step": 882 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4262, - "step": 883 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2227, - "step": 884 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1062, - "step": 885 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9417, - "step": 886 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3315, - "step": 887 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0012, - "step": 888 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6386, - "step": 889 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0377, - "step": 890 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6707, - "step": 891 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4955, - "step": 892 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7343, - "step": 893 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8305, - "step": 894 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7016, - "step": 895 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7149, - "step": 896 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5649, - "step": 897 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.815, - "step": 898 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6135, - "step": 899 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8776, - "step": 900 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7288, - "step": 901 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8019, - "step": 902 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0921, - "step": 903 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.824, - "step": 904 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7151, - "step": 905 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5054, - "step": 906 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8095, - "step": 907 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3218, - "step": 908 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9993, - "step": 909 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4433, - "step": 910 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5863, - "step": 911 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.505, - "step": 912 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9734, - "step": 913 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1792, - "step": 914 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4574, - "step": 915 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2787, - "step": 916 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8201, - "step": 917 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2537, - "step": 918 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1387, - "step": 919 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7161, - "step": 920 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2207, - "step": 921 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7953, - "step": 922 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9949, - "step": 923 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9173, - "step": 924 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7903, - "step": 925 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4784, - "step": 926 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2264, - "step": 927 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.566, - "step": 928 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0686, - "step": 929 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.791, - "step": 930 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8393, - "step": 931 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4387, - "step": 932 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2374, - "step": 933 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9598, - "step": 934 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1597, - "step": 935 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0403, - "step": 936 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3301, - "step": 937 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.072, - "step": 938 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4788, - "step": 939 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0656, - "step": 940 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9647, - "step": 941 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1168, - "step": 942 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0293, - "step": 943 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3622, - "step": 944 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8957, - "step": 945 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4, - "step": 946 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6626, - "step": 947 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8212, - "step": 948 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8638, - "step": 949 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6406, - "step": 950 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7069, - "step": 951 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1384, - "step": 952 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.612, - "step": 953 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7201, - "step": 954 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3532, - "step": 955 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1266, - "step": 956 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6192, - "step": 957 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.826, - "step": 958 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9338, - "step": 959 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4487, - "step": 960 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.872, - "step": 961 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8601, - "step": 962 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7401, - "step": 963 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5412, - "step": 964 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2501, - "step": 965 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6837, - "step": 966 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6494, - "step": 967 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.604, - "step": 968 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.837, - "step": 969 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3957, - "step": 970 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3281, - "step": 971 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8264, - "step": 972 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6554, - "step": 973 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5768, - "step": 974 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4187, - "step": 975 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8479, - "step": 976 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9849, - "step": 977 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6471, - "step": 978 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8041, - "step": 979 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8876, - "step": 980 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6423, - "step": 981 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5329, - "step": 982 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2801, - "step": 983 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1699, - "step": 984 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6469, - "step": 985 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6766, - "step": 986 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7538, - "step": 987 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9606, - "step": 988 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0713, - "step": 989 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4965, - "step": 990 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3408, - "step": 991 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4007, - "step": 992 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8921, - "step": 993 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8681, - "step": 994 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.8867, - "step": 995 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.467, - "step": 996 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7895, - "step": 997 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0523, - "step": 998 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4032, - "step": 999 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7719, - "step": 1000 - }, - { - "epoch": 0.01, - "eval_loss": 6.766034126281738, - "eval_runtime": 22.4042, - "eval_samples_per_second": 2.232, - "eval_steps_per_second": 1.116, - "step": 1000 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 4.338861379623413, - "step": 1000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0285, - "step": 1001 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4571, - "step": 1002 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7721, - "step": 1003 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5108, - "step": 1004 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3813, - "step": 1005 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7963, - "step": 1006 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1101, - "step": 1007 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.021, - "step": 1008 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5916, - "step": 1009 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8813, - "step": 1010 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1193, - "step": 1011 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5092, - "step": 1012 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8569, - "step": 1013 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.119, - "step": 1014 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3247, - "step": 1015 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2358, - "step": 1016 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2795, - "step": 1017 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3466, - "step": 1018 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5443, - "step": 1019 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7296, - "step": 1020 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0412, - "step": 1021 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4829, - "step": 1022 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7901, - "step": 1023 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8077, - "step": 1024 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4887, - "step": 1025 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3095, - "step": 1026 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3235, - "step": 1027 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6315, - "step": 1028 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4294, - "step": 1029 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8457, - "step": 1030 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7583, - "step": 1031 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3129, - "step": 1032 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1832, - "step": 1033 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1764, - "step": 1034 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0101, - "step": 1035 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6524, - "step": 1036 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2825, - "step": 1037 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2262, - "step": 1038 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2533, - "step": 1039 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8794, - "step": 1040 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7901, - "step": 1041 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8351, - "step": 1042 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5888, - "step": 1043 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8932, - "step": 1044 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2999, - "step": 1045 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8396, - "step": 1046 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4209, - "step": 1047 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1524, - "step": 1048 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7784, - "step": 1049 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0179, - "step": 1050 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1153, - "step": 1051 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2149, - "step": 1052 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0117, - "step": 1053 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9693, - "step": 1054 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5656, - "step": 1055 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5, - "step": 1056 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.102, - "step": 1057 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3079, - "step": 1058 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5754, - "step": 1059 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6989, - "step": 1060 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9597, - "step": 1061 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3743, - "step": 1062 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8887, - "step": 1063 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3779, - "step": 1064 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5001, - "step": 1065 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4095, - "step": 1066 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5073, - "step": 1067 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1331, - "step": 1068 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.323, - "step": 1069 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6116, - "step": 1070 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1212, - "step": 1071 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0951, - "step": 1072 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2463, - "step": 1073 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4488, - "step": 1074 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.279, - "step": 1075 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5728, - "step": 1076 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1362, - "step": 1077 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6648, - "step": 1078 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.427, - "step": 1079 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8145, - "step": 1080 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5308, - "step": 1081 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.974, - "step": 1082 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1965, - "step": 1083 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8749, - "step": 1084 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7352, - "step": 1085 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7934, - "step": 1086 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6003, - "step": 1087 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5775, - "step": 1088 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.519, - "step": 1089 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7403, - "step": 1090 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8145, - "step": 1091 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5776, - "step": 1092 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3753, - "step": 1093 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9586, - "step": 1094 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7263, - "step": 1095 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7034, - "step": 1096 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0579, - "step": 1097 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8419, - "step": 1098 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0751, - "step": 1099 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6438, - "step": 1100 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8744, - "step": 1101 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4992, - "step": 1102 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8094, - "step": 1103 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.162, - "step": 1104 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8351, - "step": 1105 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8845, - "step": 1106 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1894, - "step": 1107 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.8333, - "step": 1108 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4226, - "step": 1109 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0712, - "step": 1110 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9981, - "step": 1111 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5885, - "step": 1112 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1915, - "step": 1113 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8003, - "step": 1114 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5566, - "step": 1115 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4085, - "step": 1116 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0793, - "step": 1117 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0909, - "step": 1118 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2273, - "step": 1119 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8273, - "step": 1120 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0231, - "step": 1121 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7229, - "step": 1122 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4479, - "step": 1123 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2178, - "step": 1124 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9038, - "step": 1125 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2653, - "step": 1126 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2974, - "step": 1127 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3003, - "step": 1128 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7853, - "step": 1129 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9143, - "step": 1130 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2573, - "step": 1131 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7091, - "step": 1132 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3372, - "step": 1133 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4165, - "step": 1134 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4422, - "step": 1135 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7693, - "step": 1136 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7802, - "step": 1137 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7263, - "step": 1138 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6749, - "step": 1139 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9459, - "step": 1140 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9697, - "step": 1141 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4506, - "step": 1142 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5099, - "step": 1143 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1475, - "step": 1144 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3769, - "step": 1145 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2035, - "step": 1146 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6017, - "step": 1147 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.463, - "step": 1148 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3844, - "step": 1149 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5306, - "step": 1150 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5502, - "step": 1151 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7394, - "step": 1152 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5626, - "step": 1153 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1618, - "step": 1154 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5174, - "step": 1155 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1038, - "step": 1156 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3789, - "step": 1157 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2985, - "step": 1158 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4763, - "step": 1159 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5071, - "step": 1160 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0827, - "step": 1161 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7349, - "step": 1162 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.798, - "step": 1163 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3176, - "step": 1164 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8114, - "step": 1165 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3379, - "step": 1166 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1157, - "step": 1167 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4675, - "step": 1168 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2721, - "step": 1169 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0603, - "step": 1170 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6358, - "step": 1171 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0865, - "step": 1172 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.709, - "step": 1173 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7705, - "step": 1174 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7677, - "step": 1175 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2418, - "step": 1176 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7114, - "step": 1177 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1165, - "step": 1178 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9654, - "step": 1179 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0672, - "step": 1180 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1738, - "step": 1181 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7604, - "step": 1182 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8426, - "step": 1183 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0231, - "step": 1184 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2938, - "step": 1185 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.783, - "step": 1186 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3328, - "step": 1187 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.321, - "step": 1188 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6368, - "step": 1189 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.101, - "step": 1190 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6777, - "step": 1191 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0831, - "step": 1192 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5853, - "step": 1193 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7923, - "step": 1194 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3734, - "step": 1195 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4268, - "step": 1196 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6796, - "step": 1197 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9028, - "step": 1198 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3716, - "step": 1199 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6761, - "step": 1200 - }, - { - "epoch": 0.01, - "eval_loss": 6.9188361167907715, - "eval_runtime": 22.426, - "eval_samples_per_second": 2.23, - "eval_steps_per_second": 1.115, - "step": 1200 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 3.3686839294433595, - "step": 1200 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8855, - "step": 1201 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8206, - "step": 1202 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4401, - "step": 1203 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2366, - "step": 1204 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9885, - "step": 1205 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5444, - "step": 1206 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4259, - "step": 1207 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5369, - "step": 1208 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0839, - "step": 1209 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7622, - "step": 1210 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8979, - "step": 1211 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5508, - "step": 1212 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6439, - "step": 1213 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6249, - "step": 1214 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.495, - "step": 1215 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0642, - "step": 1216 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8997, - "step": 1217 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6275, - "step": 1218 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3317, - "step": 1219 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4635, - "step": 1220 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5197, - "step": 1221 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5928, - "step": 1222 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2363, - "step": 1223 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0266, - "step": 1224 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3356, - "step": 1225 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7927, - "step": 1226 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6952, - "step": 1227 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8878, - "step": 1228 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7472, - "step": 1229 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6454, - "step": 1230 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4972, - "step": 1231 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3347, - "step": 1232 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1631, - "step": 1233 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4708, - "step": 1234 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5697, - "step": 1235 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8218, - "step": 1236 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.269, - "step": 1237 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4165, - "step": 1238 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3653, - "step": 1239 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0152, - "step": 1240 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9157, - "step": 1241 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4086, - "step": 1242 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2493, - "step": 1243 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8279, - "step": 1244 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6649, - "step": 1245 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4405, - "step": 1246 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1992, - "step": 1247 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2055, - "step": 1248 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4395, - "step": 1249 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2475, - "step": 1250 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8205, - "step": 1251 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1053, - "step": 1252 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7494, - "step": 1253 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7387, - "step": 1254 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8983, - "step": 1255 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5614, - "step": 1256 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7617, - "step": 1257 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2445, - "step": 1258 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3043, - "step": 1259 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4214, - "step": 1260 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1384, - "step": 1261 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3914, - "step": 1262 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3287, - "step": 1263 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2174, - "step": 1264 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4397, - "step": 1265 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6875, - "step": 1266 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4512, - "step": 1267 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2834, - "step": 1268 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7651, - "step": 1269 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9263, - "step": 1270 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6721, - "step": 1271 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9178, - "step": 1272 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7967, - "step": 1273 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5242, - "step": 1274 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7794, - "step": 1275 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4256, - "step": 1276 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5788, - "step": 1277 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7586, - "step": 1278 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.964, - "step": 1279 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0749, - "step": 1280 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6248, - "step": 1281 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2465, - "step": 1282 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1591, - "step": 1283 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4328, - "step": 1284 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.534, - "step": 1285 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.523, - "step": 1286 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5672, - "step": 1287 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9162, - "step": 1288 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1089, - "step": 1289 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3287, - "step": 1290 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2499, - "step": 1291 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9645, - "step": 1292 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3903, - "step": 1293 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5322, - "step": 1294 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2211, - "step": 1295 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2788, - "step": 1296 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1862, - "step": 1297 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2678, - "step": 1298 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5399, - "step": 1299 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7935, - "step": 1300 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0391, - "step": 1301 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1049, - "step": 1302 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.365, - "step": 1303 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8809, - "step": 1304 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2335, - "step": 1305 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.5135, - "step": 1306 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2378, - "step": 1307 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9265, - "step": 1308 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.641, - "step": 1309 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9822, - "step": 1310 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3369, - "step": 1311 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3735, - "step": 1312 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2618, - "step": 1313 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6854, - "step": 1314 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3748, - "step": 1315 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9206, - "step": 1316 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1969, - "step": 1317 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1245, - "step": 1318 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9977, - "step": 1319 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5319, - "step": 1320 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4431, - "step": 1321 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7264, - "step": 1322 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.05, - "step": 1323 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3118, - "step": 1324 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4575, - "step": 1325 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.593, - "step": 1326 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0061, - "step": 1327 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2481, - "step": 1328 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8017, - "step": 1329 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8617, - "step": 1330 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7036, - "step": 1331 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0091, - "step": 1332 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9687, - "step": 1333 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3925, - "step": 1334 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1127, - "step": 1335 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8163, - "step": 1336 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0639, - "step": 1337 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8929, - "step": 1338 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5011, - "step": 1339 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.033, - "step": 1340 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0526, - "step": 1341 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4428, - "step": 1342 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3477, - "step": 1343 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.881, - "step": 1344 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5276, - "step": 1345 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4183, - "step": 1346 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4943, - "step": 1347 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9187, - "step": 1348 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1003, - "step": 1349 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1187, - "step": 1350 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8081, - "step": 1351 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4695, - "step": 1352 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5761, - "step": 1353 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9635, - "step": 1354 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2133, - "step": 1355 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2611, - "step": 1356 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6885, - "step": 1357 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1157, - "step": 1358 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4421, - "step": 1359 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2128, - "step": 1360 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6978, - "step": 1361 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9804, - "step": 1362 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3426, - "step": 1363 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2676, - "step": 1364 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.325, - "step": 1365 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1263, - "step": 1366 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7481, - "step": 1367 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6891, - "step": 1368 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8568, - "step": 1369 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9893, - "step": 1370 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0302, - "step": 1371 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3204, - "step": 1372 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9008, - "step": 1373 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2624, - "step": 1374 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6234, - "step": 1375 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2286, - "step": 1376 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3426, - "step": 1377 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1962, - "step": 1378 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3142, - "step": 1379 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.414, - "step": 1380 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0191, - "step": 1381 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4953, - "step": 1382 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6694, - "step": 1383 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8611, - "step": 1384 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.86, - "step": 1385 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6519, - "step": 1386 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.394, - "step": 1387 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2117, - "step": 1388 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9924, - "step": 1389 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.58, - "step": 1390 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4415, - "step": 1391 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7196, - "step": 1392 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7388, - "step": 1393 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4784, - "step": 1394 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.496, - "step": 1395 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8706, - "step": 1396 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1858, - "step": 1397 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9038, - "step": 1398 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4852, - "step": 1399 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2717, - "step": 1400 - }, - { - "epoch": 0.01, - "eval_loss": 6.97923469543457, - "eval_runtime": 22.472, - "eval_samples_per_second": 2.225, - "eval_steps_per_second": 1.112, - "step": 1400 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.657382688522339, - "step": 1400 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.843, - "step": 1401 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5611, - "step": 1402 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2434, - "step": 1403 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3136, - "step": 1404 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.686, - "step": 1405 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6365, - "step": 1406 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1811, - "step": 1407 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7537, - "step": 1408 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2949, - "step": 1409 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4827, - "step": 1410 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0965, - "step": 1411 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.007, - "step": 1412 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2861, - "step": 1413 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1774, - "step": 1414 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7777, - "step": 1415 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0259, - "step": 1416 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9024, - "step": 1417 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4786, - "step": 1418 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5873, - "step": 1419 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2744, - "step": 1420 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9484, - "step": 1421 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2093, - "step": 1422 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3394, - "step": 1423 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1247, - "step": 1424 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0691, - "step": 1425 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.559, - "step": 1426 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1518, - "step": 1427 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4143, - "step": 1428 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0287, - "step": 1429 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8112, - "step": 1430 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2625, - "step": 1431 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3528, - "step": 1432 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2715, - "step": 1433 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7849, - "step": 1434 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2002, - "step": 1435 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0658, - "step": 1436 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0671, - "step": 1437 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2577, - "step": 1438 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.803, - "step": 1439 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2974, - "step": 1440 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0897, - "step": 1441 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0805, - "step": 1442 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7681, - "step": 1443 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6565, - "step": 1444 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0174, - "step": 1445 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8507, - "step": 1446 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2105, - "step": 1447 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.572, - "step": 1448 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2904, - "step": 1449 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4623, - "step": 1450 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4774, - "step": 1451 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1277, - "step": 1452 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6204, - "step": 1453 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3219, - "step": 1454 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2517, - "step": 1455 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3026, - "step": 1456 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4016, - "step": 1457 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5256, - "step": 1458 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9316, - "step": 1459 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.631, - "step": 1460 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2888, - "step": 1461 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5511, - "step": 1462 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.9799, - "step": 1463 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6982, - "step": 1464 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4923, - "step": 1465 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8329, - "step": 1466 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2733, - "step": 1467 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8221, - "step": 1468 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.363, - "step": 1469 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6348, - "step": 1470 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3319, - "step": 1471 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6768, - "step": 1472 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1985, - "step": 1473 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6109, - "step": 1474 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.974, - "step": 1475 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8902, - "step": 1476 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6762, - "step": 1477 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8541, - "step": 1478 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3867, - "step": 1479 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9624, - "step": 1480 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8768, - "step": 1481 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7598, - "step": 1482 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6522, - "step": 1483 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8156, - "step": 1484 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3791, - "step": 1485 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2178, - "step": 1486 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8448, - "step": 1487 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5377, - "step": 1488 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7407, - "step": 1489 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7636, - "step": 1490 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4325, - "step": 1491 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8966, - "step": 1492 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0626, - "step": 1493 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.255, - "step": 1494 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2802, - "step": 1495 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.894, - "step": 1496 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6482, - "step": 1497 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8903, - "step": 1498 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8672, - "step": 1499 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6079, - "step": 1500 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6217, - "step": 1501 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2361, - "step": 1502 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3962, - "step": 1503 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0036, - "step": 1504 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5926, - "step": 1505 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.114, - "step": 1506 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4419, - "step": 1507 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7838, - "step": 1508 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6635, - "step": 1509 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2906, - "step": 1510 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4186, - "step": 1511 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4783, - "step": 1512 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1226, - "step": 1513 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2458, - "step": 1514 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5302, - "step": 1515 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1515, - "step": 1516 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4182, - "step": 1517 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8248, - "step": 1518 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2349, - "step": 1519 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9314, - "step": 1520 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1161, - "step": 1521 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4183, - "step": 1522 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4454, - "step": 1523 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5588, - "step": 1524 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8026, - "step": 1525 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7695, - "step": 1526 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3636, - "step": 1527 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2776, - "step": 1528 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5386, - "step": 1529 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.521, - "step": 1530 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8388, - "step": 1531 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3561, - "step": 1532 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9606, - "step": 1533 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9438, - "step": 1534 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7665, - "step": 1535 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5826, - "step": 1536 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.0798, - "step": 1537 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8545, - "step": 1538 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.302, - "step": 1539 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1092, - "step": 1540 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5021, - "step": 1541 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9384, - "step": 1542 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8761, - "step": 1543 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3316, - "step": 1544 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2051, - "step": 1545 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7907, - "step": 1546 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2534, - "step": 1547 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2274, - "step": 1548 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9226, - "step": 1549 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2502, - "step": 1550 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2703, - "step": 1551 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4359, - "step": 1552 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.128, - "step": 1553 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3147, - "step": 1554 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.026, - "step": 1555 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9393, - "step": 1556 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7753, - "step": 1557 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9049, - "step": 1558 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0538, - "step": 1559 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8691, - "step": 1560 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9377, - "step": 1561 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8385, - "step": 1562 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.939, - "step": 1563 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.727, - "step": 1564 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7866, - "step": 1565 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2439, - "step": 1566 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9607, - "step": 1567 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3505, - "step": 1568 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7729, - "step": 1569 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4669, - "step": 1570 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8178, - "step": 1571 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2173, - "step": 1572 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2136, - "step": 1573 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2888, - "step": 1574 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0386, - "step": 1575 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9041, - "step": 1576 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7544, - "step": 1577 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.3229, - "step": 1578 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4203, - "step": 1579 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.497, - "step": 1580 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8253, - "step": 1581 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0801, - "step": 1582 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1585, - "step": 1583 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6965, - "step": 1584 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.498, - "step": 1585 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8697, - "step": 1586 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2663, - "step": 1587 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7004, - "step": 1588 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6561, - "step": 1589 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.785, - "step": 1590 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5708, - "step": 1591 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.326, - "step": 1592 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2974, - "step": 1593 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1408, - "step": 1594 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6526, - "step": 1595 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4116, - "step": 1596 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0484, - "step": 1597 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3162, - "step": 1598 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3806, - "step": 1599 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0251, - "step": 1600 - }, - { - "epoch": 0.01, - "eval_loss": 6.617897987365723, - "eval_runtime": 22.4646, - "eval_samples_per_second": 2.226, - "eval_steps_per_second": 1.113, - "step": 1600 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.160770101547241, - "step": 1600 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9137, - "step": 1601 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2188, - "step": 1602 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7688, - "step": 1603 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9799, - "step": 1604 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5429, - "step": 1605 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8559, - "step": 1606 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3985, - "step": 1607 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9139, - "step": 1608 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3303, - "step": 1609 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5168, - "step": 1610 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5194, - "step": 1611 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9557, - "step": 1612 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7102, - "step": 1613 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8961, - "step": 1614 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6123, - "step": 1615 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7808, - "step": 1616 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4919, - "step": 1617 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0723, - "step": 1618 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2931, - "step": 1619 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8478, - "step": 1620 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7126, - "step": 1621 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6622, - "step": 1622 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3861, - "step": 1623 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9856, - "step": 1624 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5076, - "step": 1625 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4168, - "step": 1626 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2825, - "step": 1627 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7497, - "step": 1628 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5672, - "step": 1629 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4095, - "step": 1630 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.649, - "step": 1631 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3418, - "step": 1632 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1337, - "step": 1633 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3829, - "step": 1634 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0294, - "step": 1635 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2164, - "step": 1636 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3294, - "step": 1637 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7047, - "step": 1638 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5622, - "step": 1639 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4873, - "step": 1640 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6641, - "step": 1641 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3943, - "step": 1642 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2266, - "step": 1643 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0471, - "step": 1644 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5658, - "step": 1645 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6489, - "step": 1646 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3851, - "step": 1647 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7921, - "step": 1648 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4581, - "step": 1649 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1407, - "step": 1650 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2919, - "step": 1651 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4061, - "step": 1652 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3081, - "step": 1653 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0527, - "step": 1654 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8729, - "step": 1655 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.029, - "step": 1656 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6632, - "step": 1657 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7047, - "step": 1658 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6288, - "step": 1659 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8466, - "step": 1660 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7726, - "step": 1661 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.245, - "step": 1662 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0538, - "step": 1663 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3611, - "step": 1664 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.011, - "step": 1665 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6491, - "step": 1666 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3409, - "step": 1667 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.262, - "step": 1668 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.781, - "step": 1669 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8025, - "step": 1670 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7529, - "step": 1671 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2322, - "step": 1672 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4527, - "step": 1673 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9457, - "step": 1674 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.859, - "step": 1675 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9207, - "step": 1676 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5378, - "step": 1677 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6585, - "step": 1678 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9523, - "step": 1679 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1348, - "step": 1680 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9582, - "step": 1681 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.416, - "step": 1682 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8214, - "step": 1683 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8833, - "step": 1684 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1021, - "step": 1685 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7392, - "step": 1686 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2616, - "step": 1687 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.325, - "step": 1688 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3814, - "step": 1689 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2816, - "step": 1690 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.033, - "step": 1691 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5742, - "step": 1692 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0841, - "step": 1693 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2888, - "step": 1694 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9263, - "step": 1695 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7552, - "step": 1696 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4163, - "step": 1697 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6207, - "step": 1698 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.938, - "step": 1699 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2925, - "step": 1700 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0401, - "step": 1701 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1536, - "step": 1702 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2754, - "step": 1703 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6765, - "step": 1704 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.63, - "step": 1705 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6902, - "step": 1706 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6434, - "step": 1707 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2283, - "step": 1708 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9377, - "step": 1709 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.371, - "step": 1710 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6569, - "step": 1711 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2221, - "step": 1712 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5375, - "step": 1713 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2189, - "step": 1714 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.769, - "step": 1715 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0089, - "step": 1716 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6402, - "step": 1717 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4812, - "step": 1718 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9754, - "step": 1719 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8435, - "step": 1720 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9424, - "step": 1721 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5465, - "step": 1722 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.477, - "step": 1723 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2254, - "step": 1724 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3663, - "step": 1725 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.663, - "step": 1726 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6951, - "step": 1727 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.856, - "step": 1728 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0652, - "step": 1729 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6929, - "step": 1730 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8069, - "step": 1731 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.02, - "step": 1732 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0782, - "step": 1733 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0236, - "step": 1734 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2769, - "step": 1735 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7126, - "step": 1736 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2746, - "step": 1737 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8726, - "step": 1738 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7962, - "step": 1739 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7602, - "step": 1740 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.3105, - "step": 1741 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0771, - "step": 1742 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4738, - "step": 1743 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2952, - "step": 1744 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2692, - "step": 1745 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7201, - "step": 1746 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2978, - "step": 1747 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.518, - "step": 1748 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.659, - "step": 1749 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9101, - "step": 1750 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8397, - "step": 1751 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0451, - "step": 1752 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7696, - "step": 1753 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1377, - "step": 1754 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2621, - "step": 1755 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2143, - "step": 1756 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4378, - "step": 1757 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8631, - "step": 1758 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.019, - "step": 1759 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7475, - "step": 1760 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6228, - "step": 1761 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0703, - "step": 1762 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3426, - "step": 1763 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0842, - "step": 1764 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1032, - "step": 1765 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6321, - "step": 1766 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7804, - "step": 1767 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6566, - "step": 1768 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4985, - "step": 1769 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1129, - "step": 1770 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8081, - "step": 1771 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8441, - "step": 1772 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4054, - "step": 1773 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6334, - "step": 1774 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4323, - "step": 1775 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.18, - "step": 1776 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7573, - "step": 1777 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4642, - "step": 1778 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.038, - "step": 1779 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3785, - "step": 1780 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5088, - "step": 1781 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0139, - "step": 1782 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0999, - "step": 1783 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3224, - "step": 1784 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.634, - "step": 1785 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1264, - "step": 1786 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.317, - "step": 1787 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1279, - "step": 1788 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2364, - "step": 1789 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0627, - "step": 1790 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2471, - "step": 1791 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8407, - "step": 1792 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7083, - "step": 1793 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4522, - "step": 1794 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0308, - "step": 1795 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6915, - "step": 1796 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.082, - "step": 1797 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7956, - "step": 1798 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7007, - "step": 1799 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9197, - "step": 1800 - }, - { - "epoch": 0.01, - "eval_loss": 6.619495868682861, - "eval_runtime": 22.4352, - "eval_samples_per_second": 2.229, - "eval_steps_per_second": 1.114, - "step": 1800 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.238778591156006, - "step": 1800 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1537, - "step": 1801 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.684, - "step": 1802 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7862, - "step": 1803 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3518, - "step": 1804 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1795, - "step": 1805 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0054, - "step": 1806 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8997, - "step": 1807 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9002, - "step": 1808 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2805, - "step": 1809 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1203, - "step": 1810 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0206, - "step": 1811 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0151, - "step": 1812 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3864, - "step": 1813 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1117, - "step": 1814 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8487, - "step": 1815 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.59, - "step": 1816 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1615, - "step": 1817 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7362, - "step": 1818 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2294, - "step": 1819 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5622, - "step": 1820 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5437, - "step": 1821 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.093, - "step": 1822 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0343, - "step": 1823 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4454, - "step": 1824 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5138, - "step": 1825 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5605, - "step": 1826 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.322, - "step": 1827 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6489, - "step": 1828 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.331, - "step": 1829 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6462, - "step": 1830 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.932, - "step": 1831 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9058, - "step": 1832 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3433, - "step": 1833 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4365, - "step": 1834 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3282, - "step": 1835 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.448, - "step": 1836 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5369, - "step": 1837 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.177, - "step": 1838 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3552, - "step": 1839 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4568, - "step": 1840 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0602, - "step": 1841 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7449, - "step": 1842 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2675, - "step": 1843 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0317, - "step": 1844 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4342, - "step": 1845 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8688, - "step": 1846 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3571, - "step": 1847 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3776, - "step": 1848 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2248, - "step": 1849 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6073, - "step": 1850 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8425, - "step": 1851 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5954, - "step": 1852 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4197, - "step": 1853 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8624, - "step": 1854 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9652, - "step": 1855 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7145, - "step": 1856 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5309, - "step": 1857 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4356, - "step": 1858 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6508, - "step": 1859 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0955, - "step": 1860 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6886, - "step": 1861 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7644, - "step": 1862 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5709, - "step": 1863 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6212, - "step": 1864 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6325, - "step": 1865 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6805, - "step": 1866 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1464, - "step": 1867 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9244, - "step": 1868 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.336, - "step": 1869 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8783, - "step": 1870 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8236, - "step": 1871 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.084, - "step": 1872 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9639, - "step": 1873 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4173, - "step": 1874 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0042, - "step": 1875 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2519, - "step": 1876 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4656, - "step": 1877 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5136, - "step": 1878 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3918, - "step": 1879 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9696, - "step": 1880 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9736, - "step": 1881 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6192, - "step": 1882 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3476, - "step": 1883 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3048, - "step": 1884 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1116, - "step": 1885 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.971, - "step": 1886 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0741, - "step": 1887 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1418, - "step": 1888 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3487, - "step": 1889 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.38, - "step": 1890 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6561, - "step": 1891 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5606, - "step": 1892 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8623, - "step": 1893 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2984, - "step": 1894 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6179, - "step": 1895 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8625, - "step": 1896 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8596, - "step": 1897 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7205, - "step": 1898 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6727, - "step": 1899 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.016, - "step": 1900 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9868, - "step": 1901 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.632, - "step": 1902 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5133, - "step": 1903 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7476, - "step": 1904 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4174, - "step": 1905 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6789, - "step": 1906 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4534, - "step": 1907 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3335, - "step": 1908 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7921, - "step": 1909 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9567, - "step": 1910 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.1739, - "step": 1911 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7514, - "step": 1912 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3858, - "step": 1913 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0462, - "step": 1914 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3817, - "step": 1915 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9739, - "step": 1916 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1122, - "step": 1917 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3361, - "step": 1918 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3184, - "step": 1919 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7342, - "step": 1920 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.375, - "step": 1921 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6841, - "step": 1922 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0773, - "step": 1923 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8916, - "step": 1924 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7176, - "step": 1925 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8841, - "step": 1926 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8345, - "step": 1927 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.561, - "step": 1928 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5392, - "step": 1929 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1627, - "step": 1930 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0657, - "step": 1931 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7385, - "step": 1932 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5533, - "step": 1933 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0925, - "step": 1934 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8752, - "step": 1935 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4039, - "step": 1936 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6472, - "step": 1937 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1819, - "step": 1938 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5919, - "step": 1939 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6527, - "step": 1940 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5188, - "step": 1941 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9856, - "step": 1942 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7038, - "step": 1943 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.911, - "step": 1944 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.497, - "step": 1945 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1804, - "step": 1946 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3949, - "step": 1947 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0433, - "step": 1948 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4706, - "step": 1949 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5896, - "step": 1950 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.557, - "step": 1951 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.34, - "step": 1952 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7865, - "step": 1953 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0797, - "step": 1954 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2896, - "step": 1955 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4096, - "step": 1956 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9538, - "step": 1957 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2778, - "step": 1958 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4968, - "step": 1959 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8328, - "step": 1960 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4597, - "step": 1961 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6776, - "step": 1962 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4861, - "step": 1963 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5831, - "step": 1964 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4585, - "step": 1965 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7898, - "step": 1966 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8714, - "step": 1967 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.752, - "step": 1968 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9024, - "step": 1969 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.058, - "step": 1970 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1745, - "step": 1971 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2162, - "step": 1972 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2668, - "step": 1973 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3307, - "step": 1974 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3285, - "step": 1975 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1353, - "step": 1976 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8069, - "step": 1977 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6885, - "step": 1978 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5946, - "step": 1979 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6828, - "step": 1980 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6516, - "step": 1981 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.261, - "step": 1982 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.524, - "step": 1983 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.585, - "step": 1984 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8883, - "step": 1985 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.689, - "step": 1986 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1083, - "step": 1987 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1606, - "step": 1988 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9243, - "step": 1989 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6597, - "step": 1990 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2849, - "step": 1991 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3715, - "step": 1992 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7262, - "step": 1993 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6862, - "step": 1994 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5412, - "step": 1995 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7483, - "step": 1996 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3391, - "step": 1997 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2642, - "step": 1998 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1519, - "step": 1999 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7098, - "step": 2000 - }, - { - "epoch": 0.02, - "eval_loss": 6.762476921081543, - "eval_runtime": 22.4899, - "eval_samples_per_second": 2.223, - "eval_steps_per_second": 1.112, - "step": 2000 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.4606559085845947, - "step": 2000 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8099, - "step": 2001 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0567, - "step": 2002 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2981, - "step": 2003 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2668, - "step": 2004 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.139, - "step": 2005 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.903, - "step": 2006 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2182, - "step": 2007 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2347, - "step": 2008 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8383, - "step": 2009 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0211, - "step": 2010 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2572, - "step": 2011 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2877, - "step": 2012 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3577, - "step": 2013 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2022, - "step": 2014 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2722, - "step": 2015 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0552, - "step": 2016 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.9857, - "step": 2017 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0519, - "step": 2018 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7118, - "step": 2019 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4465, - "step": 2020 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3009, - "step": 2021 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3614, - "step": 2022 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3493, - "step": 2023 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.34, - "step": 2024 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0416, - "step": 2025 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.686, - "step": 2026 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6021, - "step": 2027 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4161, - "step": 2028 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.0029, - "step": 2029 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.8579, - "step": 2030 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0247, - "step": 2031 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4184, - "step": 2032 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4962, - "step": 2033 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5137, - "step": 2034 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.6692, - "step": 2035 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7161, - "step": 2036 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.617, - "step": 2037 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.413, - "step": 2038 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3056, - "step": 2039 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9441, - "step": 2040 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9943, - "step": 2041 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5703, - "step": 2042 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1881, - "step": 2043 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5763, - "step": 2044 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6389, - "step": 2045 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1717, - "step": 2046 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5482, - "step": 2047 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9469, - "step": 2048 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7685, - "step": 2049 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1381, - "step": 2050 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6961, - "step": 2051 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6425, - "step": 2052 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5354, - "step": 2053 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2404, - "step": 2054 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1556, - "step": 2055 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7133, - "step": 2056 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8166, - "step": 2057 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5071, - "step": 2058 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5429, - "step": 2059 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0367, - "step": 2060 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5386, - "step": 2061 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.5899, - "step": 2062 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2968, - "step": 2063 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9951, - "step": 2064 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8608, - "step": 2065 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4735, - "step": 2066 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5612, - "step": 2067 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7461, - "step": 2068 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5887, - "step": 2069 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3426, - "step": 2070 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5589, - "step": 2071 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.498, - "step": 2072 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1306, - "step": 2073 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.3492, - "step": 2074 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2311, - "step": 2075 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8798, - "step": 2076 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6799, - "step": 2077 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5011, - "step": 2078 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8892, - "step": 2079 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6449, - "step": 2080 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9117, - "step": 2081 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1157, - "step": 2082 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.196, - "step": 2083 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.9364, - "step": 2084 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3618, - "step": 2085 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3755, - "step": 2086 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4564, - "step": 2087 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4912, - "step": 2088 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.113, - "step": 2089 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0588, - "step": 2090 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.668, - "step": 2091 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.08, - "step": 2092 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2042, - "step": 2093 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4134, - "step": 2094 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0456, - "step": 2095 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2245, - "step": 2096 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4936, - "step": 2097 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5158, - "step": 2098 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7269, - "step": 2099 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7077, - "step": 2100 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6008, - "step": 2101 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4652, - "step": 2102 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.918, - "step": 2103 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.5819, - "step": 2104 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7764, - "step": 2105 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.0525, - "step": 2106 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5359, - "step": 2107 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4925, - "step": 2108 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4857, - "step": 2109 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.9445, - "step": 2110 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8494, - "step": 2111 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1513, - "step": 2112 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2552, - "step": 2113 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7229, - "step": 2114 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8571, - "step": 2115 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5968, - "step": 2116 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8806, - "step": 2117 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4641, - "step": 2118 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6039, - "step": 2119 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1379, - "step": 2120 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6688, - "step": 2121 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.293, - "step": 2122 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5664, - "step": 2123 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0825, - "step": 2124 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9788, - "step": 2125 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.9641, - "step": 2126 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7799, - "step": 2127 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0619, - "step": 2128 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0022, - "step": 2129 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8022, - "step": 2130 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5301, - "step": 2131 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.681, - "step": 2132 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7362, - "step": 2133 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5462, - "step": 2134 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2356, - "step": 2135 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2259, - "step": 2136 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3646, - "step": 2137 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8765, - "step": 2138 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6487, - "step": 2139 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9622, - "step": 2140 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1761, - "step": 2141 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6922, - "step": 2142 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0371, - "step": 2143 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7869, - "step": 2144 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3725, - "step": 2145 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8894, - "step": 2146 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6083, - "step": 2147 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4451, - "step": 2148 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1149, - "step": 2149 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8058, - "step": 2150 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1308, - "step": 2151 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1447, - "step": 2152 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.208, - "step": 2153 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5193, - "step": 2154 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7729, - "step": 2155 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5019, - "step": 2156 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6092, - "step": 2157 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1853, - "step": 2158 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7, - "step": 2159 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1638, - "step": 2160 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.762, - "step": 2161 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7455, - "step": 2162 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9372, - "step": 2163 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4569, - "step": 2164 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6705, - "step": 2165 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1988, - "step": 2166 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2526, - "step": 2167 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9066, - "step": 2168 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1365, - "step": 2169 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3422, - "step": 2170 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2691, - "step": 2171 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9008, - "step": 2172 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2555, - "step": 2173 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0886, - "step": 2174 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0369, - "step": 2175 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5566, - "step": 2176 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2567, - "step": 2177 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0179, - "step": 2178 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5383, - "step": 2179 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4797, - "step": 2180 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0163, - "step": 2181 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2658, - "step": 2182 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1337, - "step": 2183 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3287, - "step": 2184 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7874, - "step": 2185 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7153, - "step": 2186 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7037, - "step": 2187 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4412, - "step": 2188 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3912, - "step": 2189 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.034, - "step": 2190 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4697, - "step": 2191 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6243, - "step": 2192 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1133, - "step": 2193 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.9005, - "step": 2194 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7386, - "step": 2195 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4169, - "step": 2196 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8139, - "step": 2197 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3012, - "step": 2198 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8223, - "step": 2199 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3757, - "step": 2200 - }, - { - "epoch": 0.02, - "eval_loss": 6.580160140991211, - "eval_runtime": 22.4971, - "eval_samples_per_second": 2.223, - "eval_steps_per_second": 1.111, - "step": 2200 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.755114164352417, - "step": 2200 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5282, - "step": 2201 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2478, - "step": 2202 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.916, - "step": 2203 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5069, - "step": 2204 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5952, - "step": 2205 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5059, - "step": 2206 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7434, - "step": 2207 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.625, - "step": 2208 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1674, - "step": 2209 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3937, - "step": 2210 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8783, - "step": 2211 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5263, - "step": 2212 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7887, - "step": 2213 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8911, - "step": 2214 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7211, - "step": 2215 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.089, - "step": 2216 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6373, - "step": 2217 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7728, - "step": 2218 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6957, - "step": 2219 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.43, - "step": 2220 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9673, - "step": 2221 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8942, - "step": 2222 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2893, - "step": 2223 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1505, - "step": 2224 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3702, - "step": 2225 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1731, - "step": 2226 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.997, - "step": 2227 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9531, - "step": 2228 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0748, - "step": 2229 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0642, - "step": 2230 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9469, - "step": 2231 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2265, - "step": 2232 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6461, - "step": 2233 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.064, - "step": 2234 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1414, - "step": 2235 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5375, - "step": 2236 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6348, - "step": 2237 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9975, - "step": 2238 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5242, - "step": 2239 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3179, - "step": 2240 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6054, - "step": 2241 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1832, - "step": 2242 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.0572, - "step": 2243 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2049, - "step": 2244 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6348, - "step": 2245 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.67, - "step": 2246 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.5627, - "step": 2247 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1851, - "step": 2248 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6792, - "step": 2249 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6344, - "step": 2250 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7603, - "step": 2251 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7737, - "step": 2252 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5323, - "step": 2253 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4059, - "step": 2254 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9343, - "step": 2255 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0156, - "step": 2256 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1851, - "step": 2257 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.44, - "step": 2258 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9079, - "step": 2259 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4982, - "step": 2260 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 12.3777, - "step": 2261 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.1265, - "step": 2262 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1428, - "step": 2263 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8072, - "step": 2264 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.911, - "step": 2265 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9453, - "step": 2266 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0168, - "step": 2267 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2098, - "step": 2268 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4417, - "step": 2269 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8449, - "step": 2270 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.394, - "step": 2271 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7642, - "step": 2272 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5555, - "step": 2273 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3576, - "step": 2274 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.386, - "step": 2275 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6677, - "step": 2276 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2385, - "step": 2277 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8074, - "step": 2278 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2963, - "step": 2279 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3612, - "step": 2280 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1837, - "step": 2281 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5882, - "step": 2282 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0968, - "step": 2283 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2376, - "step": 2284 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3835, - "step": 2285 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0143, - "step": 2286 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.36, - "step": 2287 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0121, - "step": 2288 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0144, - "step": 2289 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6807, - "step": 2290 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8854, - "step": 2291 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1727, - "step": 2292 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.533, - "step": 2293 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9793, - "step": 2294 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.538, - "step": 2295 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.145, - "step": 2296 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.163, - "step": 2297 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1045, - "step": 2298 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0209, - "step": 2299 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9728, - "step": 2300 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8902, - "step": 2301 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3075, - "step": 2302 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.2194, - "step": 2303 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7375, - "step": 2304 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3863, - "step": 2305 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1317, - "step": 2306 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1878, - "step": 2307 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6124, - "step": 2308 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8843, - "step": 2309 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3988, - "step": 2310 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3523, - "step": 2311 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5766, - "step": 2312 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9096, - "step": 2313 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9315, - "step": 2314 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4044, - "step": 2315 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6206, - "step": 2316 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2429, - "step": 2317 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0383, - "step": 2318 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4282, - "step": 2319 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8973, - "step": 2320 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1771, - "step": 2321 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.624, - "step": 2322 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5197, - "step": 2323 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7313, - "step": 2324 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8234, - "step": 2325 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1702, - "step": 2326 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.536, - "step": 2327 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1904, - "step": 2328 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2077, - "step": 2329 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.891, - "step": 2330 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6784, - "step": 2331 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6611, - "step": 2332 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3402, - "step": 2333 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 11.1523, - "step": 2334 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5547, - "step": 2335 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3485, - "step": 2336 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8289, - "step": 2337 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2558, - "step": 2338 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1794, - "step": 2339 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8782, - "step": 2340 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.415, - "step": 2341 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5257, - "step": 2342 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4751, - "step": 2343 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2259, - "step": 2344 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8681, - "step": 2345 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6307, - "step": 2346 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1487, - "step": 2347 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.3949, - "step": 2348 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6988, - "step": 2349 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1299, - "step": 2350 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9938, - "step": 2351 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4176, - "step": 2352 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0184, - "step": 2353 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2779, - "step": 2354 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0162, - "step": 2355 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2335, - "step": 2356 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5505, - "step": 2357 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6445, - "step": 2358 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6513, - "step": 2359 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8503, - "step": 2360 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1817, - "step": 2361 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4376, - "step": 2362 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1351, - "step": 2363 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7566, - "step": 2364 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.626, - "step": 2365 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5818, - "step": 2366 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3033, - "step": 2367 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9289, - "step": 2368 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0301, - "step": 2369 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4713, - "step": 2370 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0931, - "step": 2371 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5812, - "step": 2372 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2272, - "step": 2373 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5174, - "step": 2374 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1849, - "step": 2375 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7496, - "step": 2376 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.776, - "step": 2377 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3555, - "step": 2378 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.688, - "step": 2379 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0143, - "step": 2380 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7788, - "step": 2381 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7772, - "step": 2382 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6875, - "step": 2383 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9944, - "step": 2384 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8363, - "step": 2385 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7276, - "step": 2386 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4892, - "step": 2387 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1083, - "step": 2388 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.834, - "step": 2389 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8406, - "step": 2390 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1168, - "step": 2391 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2535, - "step": 2392 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9025, - "step": 2393 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4481, - "step": 2394 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7631, - "step": 2395 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2051, - "step": 2396 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7816, - "step": 2397 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2566, - "step": 2398 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1125, - "step": 2399 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5952, - "step": 2400 - }, - { - "epoch": 0.02, - "eval_loss": 6.616010665893555, - "eval_runtime": 22.4801, - "eval_samples_per_second": 2.224, - "eval_steps_per_second": 1.112, - "step": 2400 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.427501640319824, - "step": 2400 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6081, - "step": 2401 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2359, - "step": 2402 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2628, - "step": 2403 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8465, - "step": 2404 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.6746, - "step": 2405 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1066, - "step": 2406 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4981, - "step": 2407 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9194, - "step": 2408 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.239, - "step": 2409 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1749, - "step": 2410 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4597, - "step": 2411 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5164, - "step": 2412 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4122, - "step": 2413 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7398, - "step": 2414 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5009, - "step": 2415 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2757, - "step": 2416 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4202, - "step": 2417 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.108, - "step": 2418 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3303, - "step": 2419 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4671, - "step": 2420 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5768, - "step": 2421 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9172, - "step": 2422 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7606, - "step": 2423 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0745, - "step": 2424 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2907, - "step": 2425 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6956, - "step": 2426 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4414, - "step": 2427 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9823, - "step": 2428 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6486, - "step": 2429 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5173, - "step": 2430 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4647, - "step": 2431 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9405, - "step": 2432 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4226, - "step": 2433 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4334, - "step": 2434 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9136, - "step": 2435 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.6707, - "step": 2436 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6107, - "step": 2437 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5296, - "step": 2438 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0736, - "step": 2439 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4398, - "step": 2440 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5669, - "step": 2441 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.303, - "step": 2442 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2993, - "step": 2443 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9063, - "step": 2444 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3015, - "step": 2445 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3117, - "step": 2446 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6228, - "step": 2447 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6216, - "step": 2448 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6188, - "step": 2449 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8996, - "step": 2450 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5802, - "step": 2451 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2603, - "step": 2452 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0921, - "step": 2453 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9377, - "step": 2454 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0934, - "step": 2455 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9832, - "step": 2456 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1084, - "step": 2457 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2592, - "step": 2458 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8545, - "step": 2459 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4335, - "step": 2460 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5591, - "step": 2461 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.284, - "step": 2462 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8171, - "step": 2463 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8541, - "step": 2464 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1355, - "step": 2465 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6885, - "step": 2466 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.311, - "step": 2467 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.72, - "step": 2468 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.007, - "step": 2469 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2275, - "step": 2470 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.129, - "step": 2471 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9531, - "step": 2472 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7327, - "step": 2473 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5001, - "step": 2474 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9443, - "step": 2475 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6005, - "step": 2476 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5551, - "step": 2477 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3044, - "step": 2478 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6257, - "step": 2479 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5029, - "step": 2480 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3717, - "step": 2481 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5226, - "step": 2482 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2921, - "step": 2483 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7169, - "step": 2484 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2185, - "step": 2485 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5393, - "step": 2486 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0286, - "step": 2487 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3738, - "step": 2488 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2249, - "step": 2489 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7828, - "step": 2490 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.87, - "step": 2491 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.681, - "step": 2492 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5446, - "step": 2493 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0769, - "step": 2494 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3587, - "step": 2495 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9188, - "step": 2496 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9357, - "step": 2497 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3449, - "step": 2498 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2753, - "step": 2499 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3077, - "step": 2500 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0085, - "step": 2501 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5436, - "step": 2502 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9096, - "step": 2503 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7288, - "step": 2504 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7344, - "step": 2505 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6385, - "step": 2506 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6713, - "step": 2507 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6065, - "step": 2508 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3456, - "step": 2509 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1633, - "step": 2510 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5557, - "step": 2511 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7075, - "step": 2512 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4954, - "step": 2513 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5104, - "step": 2514 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5706, - "step": 2515 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7479, - "step": 2516 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7042, - "step": 2517 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9569, - "step": 2518 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7846, - "step": 2519 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.749, - "step": 2520 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5868, - "step": 2521 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3957, - "step": 2522 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2594, - "step": 2523 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.632, - "step": 2524 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.023, - "step": 2525 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0423, - "step": 2526 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1901, - "step": 2527 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0025, - "step": 2528 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0252, - "step": 2529 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8165, - "step": 2530 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6864, - "step": 2531 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.1174, - "step": 2532 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.46, - "step": 2533 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3714, - "step": 2534 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1206, - "step": 2535 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3734, - "step": 2536 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7229, - "step": 2537 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0124, - "step": 2538 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2105, - "step": 2539 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1127, - "step": 2540 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.1163, - "step": 2541 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5255, - "step": 2542 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2058, - "step": 2543 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7425, - "step": 2544 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3621, - "step": 2545 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7541, - "step": 2546 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9611, - "step": 2547 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3031, - "step": 2548 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1756, - "step": 2549 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6146, - "step": 2550 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1434, - "step": 2551 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0786, - "step": 2552 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9498, - "step": 2553 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8681, - "step": 2554 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5898, - "step": 2555 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7522, - "step": 2556 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3648, - "step": 2557 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8895, - "step": 2558 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9903, - "step": 2559 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1113, - "step": 2560 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6541, - "step": 2561 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8563, - "step": 2562 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.0685, - "step": 2563 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.59, - "step": 2564 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0927, - "step": 2565 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3792, - "step": 2566 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.208, - "step": 2567 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9275, - "step": 2568 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.25, - "step": 2569 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9524, - "step": 2570 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.556, - "step": 2571 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6414, - "step": 2572 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1749, - "step": 2573 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4529, - "step": 2574 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9764, - "step": 2575 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1835, - "step": 2576 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.438, - "step": 2577 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.834, - "step": 2578 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8879, - "step": 2579 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1302, - "step": 2580 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8966, - "step": 2581 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7319, - "step": 2582 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3287, - "step": 2583 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3322, - "step": 2584 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0278, - "step": 2585 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5355, - "step": 2586 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2016, - "step": 2587 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8335, - "step": 2588 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.721, - "step": 2589 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4628, - "step": 2590 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7896, - "step": 2591 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7226, - "step": 2592 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5834, - "step": 2593 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8944, - "step": 2594 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1519, - "step": 2595 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2173, - "step": 2596 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9751, - "step": 2597 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1408, - "step": 2598 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2469, - "step": 2599 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3136, - "step": 2600 - }, - { - "epoch": 0.02, - "eval_loss": 6.580307483673096, - "eval_runtime": 22.5866, - "eval_samples_per_second": 2.214, - "eval_steps_per_second": 1.107, - "step": 2600 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.17715097402597402, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.21428571428571427, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.0, - "mmlu_loss": 3.684196367263794, - "step": 2600 - } - ], - "max_steps": 30000, - "num_train_epochs": 1, - "total_flos": 4.352849065613722e+16, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoint-2600/training_args.bin b/checkpoint-2600/training_args.bin deleted file mode 100644 index 29a1b90871dc30211978426049e89f31e2b38f56..0000000000000000000000000000000000000000 --- a/checkpoint-2600/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2493c95326c359fb00f88976162bc7966690beaaca22964b91c1db649a04988f -size 6011 diff --git a/checkpoint-2800/README.md b/checkpoint-2800/README.md deleted file mode 100644 index 82793f73e61dbb024e11fc6697bba1622d4d0db6..0000000000000000000000000000000000000000 --- a/checkpoint-2800/README.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -library_name: peft ---- -## Training procedure - - -The following `bitsandbytes` quantization config was used during training: -- load_in_8bit: False -- load_in_4bit: True -- llm_int8_threshold: 6.0 -- llm_int8_skip_modules: None -- llm_int8_enable_fp32_cpu_offload: False -- llm_int8_has_fp16_weight: False -- bnb_4bit_quant_type: nf4 -- bnb_4bit_use_double_quant: True -- bnb_4bit_compute_dtype: bfloat16 -### Framework versions - - -- PEFT 0.4.0 diff --git a/checkpoint-2800/adapter_config.json b/checkpoint-2800/adapter_config.json deleted file mode 100644 index a2f0ea437da66b2120cc72d92fb46f999dfb8535..0000000000000000000000000000000000000000 --- a/checkpoint-2800/adapter_config.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "auto_mapping": null, - "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": true, - "layers_pattern": null, - "layers_to_transform": null, - "lora_alpha": 16.0, - "lora_dropout": 0.1, - "modules_to_save": null, - "peft_type": "LORA", - "r": 64, - "revision": null, - "target_modules": [ - "down_proj", - "up_proj", - "q_proj", - "gate_proj", - "o_proj", - "v_proj", - "k_proj" - ], - "task_type": "CAUSAL_LM" -} \ No newline at end of file diff --git a/checkpoint-2800/adapter_model.bin b/checkpoint-2800/adapter_model.bin deleted file mode 100644 index ed889d01d441e1146f24c7f093fa697b1fe5dd20..0000000000000000000000000000000000000000 --- a/checkpoint-2800/adapter_model.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f225cf9428443345e7ae8d074f6c79ee61a6dba536cd308e438af491df396828 -size 871609293 diff --git a/checkpoint-2800/added_tokens.json b/checkpoint-2800/added_tokens.json deleted file mode 100644 index e41416ddd79948246ea2dced6800ea3cd531c424..0000000000000000000000000000000000000000 --- a/checkpoint-2800/added_tokens.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "[PAD]": 32000 -} diff --git a/checkpoint-2800/optimizer.pt b/checkpoint-2800/optimizer.pt deleted file mode 100644 index 6a99d6b0d6127618463ee1a2d168acb145fdf9cf..0000000000000000000000000000000000000000 --- a/checkpoint-2800/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:79fcc33a32a377baf845df2980411ec64b9e5c19908b53d5583187466dcc4980 -size 873873439 diff --git a/checkpoint-2800/rng_state.pth b/checkpoint-2800/rng_state.pth deleted file mode 100644 index 123f0ce9f4aa72f05cc5020704de19780b071fa5..0000000000000000000000000000000000000000 --- a/checkpoint-2800/rng_state.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9eab4de808f8212c8f41a34d0aff543805b7f7ba5ec8f1165b90adc0dcd2dcdb -size 14511 diff --git a/checkpoint-2800/scheduler.pt b/checkpoint-2800/scheduler.pt deleted file mode 100644 index 9c65408bf410f9c41839c25ee20830578d47a42c..0000000000000000000000000000000000000000 --- a/checkpoint-2800/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ad4a2f0e95230b57ff6f263c61d2ad6bdd419ee16f185bfa84113511ab58dc70 -size 627 diff --git a/checkpoint-2800/special_tokens_map.json b/checkpoint-2800/special_tokens_map.json deleted file mode 100644 index 3f58a5e115855c6ea3cec98accae196ad927222e..0000000000000000000000000000000000000000 --- a/checkpoint-2800/special_tokens_map.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "bos_token": "", - "eos_token": "", - "pad_token": "[PAD]", - "unk_token": "" -} diff --git a/checkpoint-2800/tokenizer.model b/checkpoint-2800/tokenizer.model deleted file mode 100644 index 6c00c742ce03c627d6cd5b795984876fa49fa899..0000000000000000000000000000000000000000 --- a/checkpoint-2800/tokenizer.model +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 -size 499723 diff --git a/checkpoint-2800/tokenizer_config.json b/checkpoint-2800/tokenizer_config.json deleted file mode 100644 index daaef2433dab9469de98b5b9a3848221ab25b7e8..0000000000000000000000000000000000000000 --- a/checkpoint-2800/tokenizer_config.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "bos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - }, - "clean_up_tokenization_spaces": false, - "eos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - }, - "legacy": null, - "model_max_length": 1000000000000000019884624838656, - "pad_token": null, - "padding_side": "right", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizer", - "unk_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - } -} diff --git a/checkpoint-2800/trainer_state.json b/checkpoint-2800/trainer_state.json deleted file mode 100644 index b2ea839bc006f83f8a3cde479717f163de4ac05d..0000000000000000000000000000000000000000 --- a/checkpoint-2800/trainer_state.json +++ /dev/null @@ -1,17068 +0,0 @@ -{ - "best_metric": 6.580160140991211, - "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-2200", - "epoch": 0.02138874035596975, - "global_step": 2800, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0808, - "step": 1 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8773, - "step": 2 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1965, - "step": 3 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.118, - "step": 4 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1773, - "step": 5 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1165, - "step": 6 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2666, - "step": 7 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3704, - "step": 8 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9976, - "step": 9 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.985, - "step": 10 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.0541, - "step": 11 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.6228, - "step": 12 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.3651, - "step": 13 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0867, - "step": 14 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.4422, - "step": 15 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.7759, - "step": 16 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1446, - "step": 17 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0007, - "step": 18 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0894, - "step": 19 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2424, - "step": 20 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.1343, - "step": 21 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5354, - "step": 22 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1887, - "step": 23 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.6652, - "step": 24 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.964, - "step": 25 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1872, - "step": 26 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.4722, - "step": 27 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1462, - "step": 28 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0485, - "step": 29 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.148, - "step": 30 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7274, - "step": 31 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.6689, - "step": 32 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3384, - "step": 33 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.5354, - "step": 34 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.1976, - "step": 35 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.8593, - "step": 36 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.9302, - "step": 37 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5968, - "step": 38 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3169, - "step": 39 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.1793, - "step": 40 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.8457, - "step": 41 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5177, - "step": 42 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.003, - "step": 43 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.9928, - "step": 44 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.2574, - "step": 45 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3915, - "step": 46 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4105, - "step": 47 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.1184, - "step": 48 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.72, - "step": 49 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9628, - "step": 50 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2372, - "step": 51 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3733, - "step": 52 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8936, - "step": 53 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5353, - "step": 54 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.0754, - "step": 55 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6685, - "step": 56 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8984, - "step": 57 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2265, - "step": 58 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7696, - "step": 59 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7349, - "step": 60 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0221, - "step": 61 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.1901, - "step": 62 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.387, - "step": 63 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7323, - "step": 64 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2077, - "step": 65 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.3155, - "step": 66 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1656, - "step": 67 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 13.0828, - "step": 68 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5295, - "step": 69 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4575, - "step": 70 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.7654, - "step": 71 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.6263, - "step": 72 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 24.8238, - "step": 73 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.0654, - "step": 74 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 28.1046, - "step": 75 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.3232, - "step": 76 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 22.9712, - "step": 77 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 18.8529, - "step": 78 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.8356, - "step": 79 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 16.472, - "step": 80 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.2369, - "step": 81 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.0731, - "step": 82 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.8853, - "step": 83 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5438, - "step": 84 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2665, - "step": 85 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5484, - "step": 86 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7546, - "step": 87 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4309, - "step": 88 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5593, - "step": 89 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3822, - "step": 90 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.6315, - "step": 91 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6116, - "step": 92 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2288, - "step": 93 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0053, - "step": 94 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.359, - "step": 95 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9235, - "step": 96 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 31.9845, - "step": 97 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.1385, - "step": 98 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6161, - "step": 99 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8096, - "step": 100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9918, - "step": 101 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.344, - "step": 102 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1607, - "step": 103 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4834, - "step": 104 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.704, - "step": 105 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1238, - "step": 106 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8066, - "step": 107 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9656, - "step": 108 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1979, - "step": 109 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2294, - "step": 110 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.066, - "step": 111 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7914, - "step": 112 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7344, - "step": 113 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6703, - "step": 114 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8817, - "step": 115 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.7733, - "step": 116 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.469, - "step": 117 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1304, - "step": 118 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.871, - "step": 119 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5353, - "step": 120 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9055, - "step": 121 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6142, - "step": 122 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0201, - "step": 123 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3805, - "step": 124 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6825, - "step": 125 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7166, - "step": 126 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7747, - "step": 127 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7695, - "step": 128 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7291, - "step": 129 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1296, - "step": 130 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5374, - "step": 131 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1854, - "step": 132 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.434, - "step": 133 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.438, - "step": 134 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3027, - "step": 135 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.382, - "step": 136 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9277, - "step": 137 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.223, - "step": 138 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3042, - "step": 139 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6361, - "step": 140 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3547, - "step": 141 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.7181, - "step": 142 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.7528, - "step": 143 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.4316, - "step": 144 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2219, - "step": 145 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7788, - "step": 146 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2749, - "step": 147 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2397, - "step": 148 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6243, - "step": 149 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.145, - "step": 150 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7951, - "step": 151 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1862, - "step": 152 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1305, - "step": 153 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5766, - "step": 154 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9232, - "step": 155 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9936, - "step": 156 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.9692, - "step": 157 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2772, - "step": 158 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.302, - "step": 159 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9931, - "step": 160 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9675, - "step": 161 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8536, - "step": 162 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6589, - "step": 163 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.932, - "step": 164 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0301, - "step": 165 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4861, - "step": 166 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1354, - "step": 167 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0717, - "step": 168 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9346, - "step": 169 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9373, - "step": 170 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8777, - "step": 171 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4193, - "step": 172 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6831, - "step": 173 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4175, - "step": 174 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3629, - "step": 175 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.118, - "step": 176 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.633, - "step": 177 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8355, - "step": 178 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4522, - "step": 179 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9272, - "step": 180 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4631, - "step": 181 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2987, - "step": 182 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1183, - "step": 183 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9976, - "step": 184 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0668, - "step": 185 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6291, - "step": 186 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5937, - "step": 187 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7382, - "step": 188 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7677, - "step": 189 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0293, - "step": 190 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6407, - "step": 191 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9508, - "step": 192 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.5053, - "step": 193 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5718, - "step": 194 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5211, - "step": 195 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9557, - "step": 196 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1609, - "step": 197 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8505, - "step": 198 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8278, - "step": 199 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8447, - "step": 200 - }, - { - "epoch": 0.0, - "eval_loss": 7.883856773376465, - "eval_runtime": 22.4254, - "eval_samples_per_second": 2.23, - "eval_steps_per_second": 1.115, - "step": 200 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 4.629522514343262, - "step": 200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3249, - "step": 201 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.352, - "step": 202 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2984, - "step": 203 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.2734, - "step": 204 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1, - "step": 205 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.448, - "step": 206 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2387, - "step": 207 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.861, - "step": 208 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.603, - "step": 209 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.29, - "step": 210 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2105, - "step": 211 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1949, - "step": 212 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0538, - "step": 213 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0343, - "step": 214 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7794, - "step": 215 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5532, - "step": 216 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2676, - "step": 217 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.566, - "step": 218 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0432, - "step": 219 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9391, - "step": 220 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.724, - "step": 221 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.229, - "step": 222 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3462, - "step": 223 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0752, - "step": 224 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1966, - "step": 225 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7279, - "step": 226 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8484, - "step": 227 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7291, - "step": 228 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2665, - "step": 229 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3551, - "step": 230 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7338, - "step": 231 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8407, - "step": 232 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3581, - "step": 233 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.441, - "step": 234 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0788, - "step": 235 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8404, - "step": 236 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4314, - "step": 237 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8426, - "step": 238 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.0205, - "step": 239 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4162, - "step": 240 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7515, - "step": 241 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1442, - "step": 242 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5868, - "step": 243 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6514, - "step": 244 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2683, - "step": 245 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.31, - "step": 246 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0161, - "step": 247 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.484, - "step": 248 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9726, - "step": 249 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0926, - "step": 250 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5279, - "step": 251 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0017, - "step": 252 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5684, - "step": 253 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3875, - "step": 254 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9489, - "step": 255 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.8948, - "step": 256 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0856, - "step": 257 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.599, - "step": 258 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1575, - "step": 259 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3701, - "step": 260 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.464, - "step": 261 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9193, - "step": 262 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5679, - "step": 263 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9424, - "step": 264 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6689, - "step": 265 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6475, - "step": 266 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4311, - "step": 267 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7426, - "step": 268 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5191, - "step": 269 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3059, - "step": 270 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0142, - "step": 271 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.4509, - "step": 272 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0831, - "step": 273 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6977, - "step": 274 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4236, - "step": 275 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2129, - "step": 276 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1394, - "step": 277 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.685, - "step": 278 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0275, - "step": 279 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.3215, - "step": 280 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6542, - "step": 281 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7614, - "step": 282 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2996, - "step": 283 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6275, - "step": 284 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8736, - "step": 285 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4667, - "step": 286 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8486, - "step": 287 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2125, - "step": 288 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4523, - "step": 289 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.551, - "step": 290 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.7158, - "step": 291 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5092, - "step": 292 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9169, - "step": 293 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5333, - "step": 294 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9949, - "step": 295 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.7189, - "step": 296 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2366, - "step": 297 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4745, - "step": 298 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2439, - "step": 299 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4176, - "step": 300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.9365, - "step": 301 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5309, - "step": 302 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2201, - "step": 303 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0312, - "step": 304 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4173, - "step": 305 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4856, - "step": 306 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5041, - "step": 307 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3597, - "step": 308 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8395, - "step": 309 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0776, - "step": 310 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7566, - "step": 311 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9767, - "step": 312 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3804, - "step": 313 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5327, - "step": 314 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5293, - "step": 315 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4531, - "step": 316 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3961, - "step": 317 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5669, - "step": 318 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8559, - "step": 319 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.117, - "step": 320 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4279, - "step": 321 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7977, - "step": 322 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.955, - "step": 323 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0164, - "step": 324 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.0495, - "step": 325 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2768, - "step": 326 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3162, - "step": 327 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.88, - "step": 328 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2157, - "step": 329 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8427, - "step": 330 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9729, - "step": 331 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1779, - "step": 332 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1302, - "step": 333 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7705, - "step": 334 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.523, - "step": 335 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9375, - "step": 336 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.1409, - "step": 337 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.633, - "step": 338 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6481, - "step": 339 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.933, - "step": 340 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9179, - "step": 341 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9332, - "step": 342 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6553, - "step": 343 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7412, - "step": 344 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.849, - "step": 345 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.7321, - "step": 346 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9717, - "step": 347 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3465, - "step": 348 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4535, - "step": 349 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2376, - "step": 350 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9025, - "step": 351 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.916, - "step": 352 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.3785, - "step": 353 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0576, - "step": 354 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5081, - "step": 355 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1303, - "step": 356 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3854, - "step": 357 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5553, - "step": 358 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9627, - "step": 359 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.402, - "step": 360 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3484, - "step": 361 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5428, - "step": 362 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9128, - "step": 363 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3934, - "step": 364 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4812, - "step": 365 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5395, - "step": 366 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6304, - "step": 367 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5626, - "step": 368 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5693, - "step": 369 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3458, - "step": 370 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6254, - "step": 371 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8706, - "step": 372 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6076, - "step": 373 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.2912, - "step": 374 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3326, - "step": 375 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3735, - "step": 376 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4916, - "step": 377 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5553, - "step": 378 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6241, - "step": 379 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6106, - "step": 380 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.266, - "step": 381 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7738, - "step": 382 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4988, - "step": 383 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2968, - "step": 384 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8512, - "step": 385 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0341, - "step": 386 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.898, - "step": 387 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.23, - "step": 388 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9608, - "step": 389 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.3679, - "step": 390 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.7074, - "step": 391 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9903, - "step": 392 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5845, - "step": 393 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6493, - "step": 394 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7962, - "step": 395 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4865, - "step": 396 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3418, - "step": 397 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3942, - "step": 398 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4715, - "step": 399 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2073, - "step": 400 - }, - { - "epoch": 0.0, - "eval_loss": 7.106412410736084, - "eval_runtime": 22.5667, - "eval_samples_per_second": 2.216, - "eval_steps_per_second": 1.108, - "step": 400 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 2.9128687667846678, - "step": 400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3984, - "step": 401 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7983, - "step": 402 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8589, - "step": 403 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9884, - "step": 404 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4427, - "step": 405 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0374, - "step": 406 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7999, - "step": 407 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2437, - "step": 408 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6902, - "step": 409 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.81, - "step": 410 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8979, - "step": 411 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0211, - "step": 412 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3945, - "step": 413 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5807, - "step": 414 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1433, - "step": 415 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9466, - "step": 416 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6276, - "step": 417 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4945, - "step": 418 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6215, - "step": 419 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.3919, - "step": 420 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7915, - "step": 421 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3284, - "step": 422 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8723, - "step": 423 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0149, - "step": 424 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.979, - "step": 425 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9175, - "step": 426 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4994, - "step": 427 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9791, - "step": 428 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1156, - "step": 429 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5813, - "step": 430 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1882, - "step": 431 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9956, - "step": 432 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6189, - "step": 433 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9624, - "step": 434 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5387, - "step": 435 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4605, - "step": 436 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.474, - "step": 437 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0497, - "step": 438 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5705, - "step": 439 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.275, - "step": 440 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9638, - "step": 441 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4857, - "step": 442 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3067, - "step": 443 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8152, - "step": 444 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1668, - "step": 445 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5293, - "step": 446 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3981, - "step": 447 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4787, - "step": 448 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5981, - "step": 449 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3569, - "step": 450 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4088, - "step": 451 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3677, - "step": 452 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4686, - "step": 453 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3552, - "step": 454 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7931, - "step": 455 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9285, - "step": 456 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0554, - "step": 457 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7277, - "step": 458 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2474, - "step": 459 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9274, - "step": 460 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2558, - "step": 461 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7547, - "step": 462 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1264, - "step": 463 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2124, - "step": 464 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8751, - "step": 465 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7317, - "step": 466 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3697, - "step": 467 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0021, - "step": 468 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3761, - "step": 469 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2291, - "step": 470 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7968, - "step": 471 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9454, - "step": 472 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0194, - "step": 473 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5048, - "step": 474 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6837, - "step": 475 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1066, - "step": 476 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3501, - "step": 477 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5071, - "step": 478 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1086, - "step": 479 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7269, - "step": 480 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5419, - "step": 481 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2974, - "step": 482 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.1433, - "step": 483 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0869, - "step": 484 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.032, - "step": 485 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0946, - "step": 486 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7162, - "step": 487 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0406, - "step": 488 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9048, - "step": 489 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2231, - "step": 490 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.6524, - "step": 491 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1151, - "step": 492 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.591, - "step": 493 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1628, - "step": 494 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0757, - "step": 495 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3471, - "step": 496 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9385, - "step": 497 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9362, - "step": 498 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2252, - "step": 499 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.359, - "step": 500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0497, - "step": 501 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0484, - "step": 502 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5773, - "step": 503 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.39, - "step": 504 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5923, - "step": 505 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2, - "step": 506 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5536, - "step": 507 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.8958, - "step": 508 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7763, - "step": 509 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2045, - "step": 510 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4219, - "step": 511 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6305, - "step": 512 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4243, - "step": 513 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7842, - "step": 514 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8769, - "step": 515 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8903, - "step": 516 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0489, - "step": 517 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1314, - "step": 518 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5973, - "step": 519 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8022, - "step": 520 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3539, - "step": 521 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.222, - "step": 522 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5403, - "step": 523 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1323, - "step": 524 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7813, - "step": 525 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4982, - "step": 526 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2426, - "step": 527 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0142, - "step": 528 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8996, - "step": 529 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8671, - "step": 530 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4139, - "step": 531 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9478, - "step": 532 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7062, - "step": 533 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0098, - "step": 534 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9195, - "step": 535 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0255, - "step": 536 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6291, - "step": 537 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3245, - "step": 538 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6382, - "step": 539 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.8076, - "step": 540 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6725, - "step": 541 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0563, - "step": 542 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.6178, - "step": 543 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7974, - "step": 544 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7535, - "step": 545 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4948, - "step": 546 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8941, - "step": 547 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6496, - "step": 548 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9084, - "step": 549 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.65, - "step": 550 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7673, - "step": 551 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2221, - "step": 552 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.14, - "step": 553 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.6747, - "step": 554 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8009, - "step": 555 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7307, - "step": 556 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0143, - "step": 557 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8098, - "step": 558 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.026, - "step": 559 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4572, - "step": 560 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7913, - "step": 561 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9962, - "step": 562 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.767, - "step": 563 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9497, - "step": 564 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9626, - "step": 565 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2536, - "step": 566 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0421, - "step": 567 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8177, - "step": 568 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9241, - "step": 569 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0162, - "step": 570 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3368, - "step": 571 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7515, - "step": 572 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6389, - "step": 573 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.662, - "step": 574 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8097, - "step": 575 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9346, - "step": 576 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3154, - "step": 577 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7724, - "step": 578 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3685, - "step": 579 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.2775, - "step": 580 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.106, - "step": 581 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4733, - "step": 582 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2334, - "step": 583 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9478, - "step": 584 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0013, - "step": 585 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7242, - "step": 586 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.922, - "step": 587 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1418, - "step": 588 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4472, - "step": 589 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4785, - "step": 590 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.783, - "step": 591 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0706, - "step": 592 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4136, - "step": 593 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5969, - "step": 594 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5157, - "step": 595 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5658, - "step": 596 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4647, - "step": 597 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2028, - "step": 598 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6913, - "step": 599 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7239, - "step": 600 - }, - { - "epoch": 0.0, - "eval_loss": 7.012163162231445, - "eval_runtime": 22.5807, - "eval_samples_per_second": 2.214, - "eval_steps_per_second": 1.107, - "step": 600 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.24488224029541, - "step": 600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5253, - "step": 601 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0392, - "step": 602 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.447, - "step": 603 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9441, - "step": 604 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1874, - "step": 605 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7817, - "step": 606 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0348, - "step": 607 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5593, - "step": 608 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9361, - "step": 609 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3534, - "step": 610 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.476, - "step": 611 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0937, - "step": 612 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3027, - "step": 613 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5586, - "step": 614 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3796, - "step": 615 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.676, - "step": 616 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5321, - "step": 617 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0059, - "step": 618 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6139, - "step": 619 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.2391, - "step": 620 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0636, - "step": 621 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0895, - "step": 622 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.62, - "step": 623 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0469, - "step": 624 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2173, - "step": 625 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9432, - "step": 626 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3928, - "step": 627 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0959, - "step": 628 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.1197, - "step": 629 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4277, - "step": 630 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.418, - "step": 631 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8687, - "step": 632 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0156, - "step": 633 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.573, - "step": 634 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.112, - "step": 635 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8954, - "step": 636 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.36, - "step": 637 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.924, - "step": 638 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4625, - "step": 639 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2023, - "step": 640 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0685, - "step": 641 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5304, - "step": 642 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4456, - "step": 643 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7271, - "step": 644 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6011, - "step": 645 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.895, - "step": 646 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.864, - "step": 647 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3452, - "step": 648 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8978, - "step": 649 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2253, - "step": 650 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2813, - "step": 651 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7248, - "step": 652 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4283, - "step": 653 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4304, - "step": 654 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3893, - "step": 655 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1115, - "step": 656 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5892, - "step": 657 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6572, - "step": 658 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.925, - "step": 659 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4431, - "step": 660 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7711, - "step": 661 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9439, - "step": 662 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3781, - "step": 663 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5573, - "step": 664 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.4476, - "step": 665 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0057, - "step": 666 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2702, - "step": 667 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5717, - "step": 668 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2242, - "step": 669 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1, - "step": 670 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0517, - "step": 671 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6543, - "step": 672 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1138, - "step": 673 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.461, - "step": 674 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7094, - "step": 675 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.521, - "step": 676 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7116, - "step": 677 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6343, - "step": 678 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3762, - "step": 679 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3603, - "step": 680 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7144, - "step": 681 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4545, - "step": 682 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8188, - "step": 683 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7965, - "step": 684 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4675, - "step": 685 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0436, - "step": 686 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1219, - "step": 687 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4517, - "step": 688 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8476, - "step": 689 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9284, - "step": 690 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7405, - "step": 691 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7142, - "step": 692 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3979, - "step": 693 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.3285, - "step": 694 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3418, - "step": 695 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4472, - "step": 696 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7355, - "step": 697 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7982, - "step": 698 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4516, - "step": 699 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2532, - "step": 700 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9959, - "step": 701 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0418, - "step": 702 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.7767, - "step": 703 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.774, - "step": 704 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8912, - "step": 705 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2178, - "step": 706 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6197, - "step": 707 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4755, - "step": 708 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8276, - "step": 709 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2925, - "step": 710 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3887, - "step": 711 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1465, - "step": 712 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5806, - "step": 713 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3063, - "step": 714 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6066, - "step": 715 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1536, - "step": 716 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5582, - "step": 717 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0353, - "step": 718 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6415, - "step": 719 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8291, - "step": 720 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.7575, - "step": 721 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9141, - "step": 722 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5217, - "step": 723 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4549, - "step": 724 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8112, - "step": 725 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2729, - "step": 726 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8515, - "step": 727 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9712, - "step": 728 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.097, - "step": 729 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0208, - "step": 730 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1377, - "step": 731 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4019, - "step": 732 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9869, - "step": 733 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2954, - "step": 734 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4144, - "step": 735 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8053, - "step": 736 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8891, - "step": 737 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.812, - "step": 738 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2657, - "step": 739 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3747, - "step": 740 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0364, - "step": 741 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8845, - "step": 742 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.887, - "step": 743 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0706, - "step": 744 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6619, - "step": 745 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2941, - "step": 746 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9192, - "step": 747 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9947, - "step": 748 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6376, - "step": 749 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0358, - "step": 750 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4578, - "step": 751 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7784, - "step": 752 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.632, - "step": 753 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8649, - "step": 754 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7951, - "step": 755 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3841, - "step": 756 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4558, - "step": 757 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7638, - "step": 758 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9413, - "step": 759 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0916, - "step": 760 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1351, - "step": 761 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6078, - "step": 762 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7982, - "step": 763 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6132, - "step": 764 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.551, - "step": 765 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3301, - "step": 766 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4888, - "step": 767 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1476, - "step": 768 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4244, - "step": 769 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6025, - "step": 770 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.102, - "step": 771 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.017, - "step": 772 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4101, - "step": 773 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1741, - "step": 774 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1256, - "step": 775 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5164, - "step": 776 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6959, - "step": 777 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7666, - "step": 778 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4336, - "step": 779 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 11.8478, - "step": 780 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8382, - "step": 781 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1792, - "step": 782 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4424, - "step": 783 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.345, - "step": 784 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6887, - "step": 785 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9867, - "step": 786 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6152, - "step": 787 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7283, - "step": 788 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0157, - "step": 789 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6044, - "step": 790 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4132, - "step": 791 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.735, - "step": 792 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3631, - "step": 793 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2308, - "step": 794 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2184, - "step": 795 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4661, - "step": 796 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9637, - "step": 797 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4178, - "step": 798 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5909, - "step": 799 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1482, - "step": 800 - }, - { - "epoch": 0.01, - "eval_loss": 7.355834484100342, - "eval_runtime": 22.6252, - "eval_samples_per_second": 2.21, - "eval_steps_per_second": 1.105, - "step": 800 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 5.191131496429444, - "step": 800 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.0427, - "step": 801 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2669, - "step": 802 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8026, - "step": 803 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4949, - "step": 804 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4491, - "step": 805 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0383, - "step": 806 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1213, - "step": 807 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5158, - "step": 808 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5648, - "step": 809 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9932, - "step": 810 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6441, - "step": 811 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8661, - "step": 812 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3609, - "step": 813 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6828, - "step": 814 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9693, - "step": 815 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3733, - "step": 816 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6286, - "step": 817 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4349, - "step": 818 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6706, - "step": 819 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3089, - "step": 820 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2394, - "step": 821 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.963, - "step": 822 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6564, - "step": 823 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.997, - "step": 824 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9261, - "step": 825 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1421, - "step": 826 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2335, - "step": 827 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3432, - "step": 828 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0154, - "step": 829 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5135, - "step": 830 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6226, - "step": 831 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1672, - "step": 832 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0853, - "step": 833 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1213, - "step": 834 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7815, - "step": 835 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8916, - "step": 836 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6464, - "step": 837 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3307, - "step": 838 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8165, - "step": 839 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.886, - "step": 840 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4781, - "step": 841 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8351, - "step": 842 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.358, - "step": 843 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6501, - "step": 844 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0864, - "step": 845 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2922, - "step": 846 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.9847, - "step": 847 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2558, - "step": 848 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0195, - "step": 849 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.996, - "step": 850 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5705, - "step": 851 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4136, - "step": 852 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6302, - "step": 853 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8761, - "step": 854 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4995, - "step": 855 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4762, - "step": 856 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5749, - "step": 857 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0273, - "step": 858 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8258, - "step": 859 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1836, - "step": 860 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5493, - "step": 861 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1891, - "step": 862 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7392, - "step": 863 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1655, - "step": 864 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5218, - "step": 865 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3759, - "step": 866 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2497, - "step": 867 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5901, - "step": 868 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0624, - "step": 869 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.2452, - "step": 870 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5649, - "step": 871 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0826, - "step": 872 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2703, - "step": 873 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9088, - "step": 874 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3875, - "step": 875 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2511, - "step": 876 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4065, - "step": 877 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.175, - "step": 878 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8358, - "step": 879 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3208, - "step": 880 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2049, - "step": 881 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8251, - "step": 882 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4262, - "step": 883 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2227, - "step": 884 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1062, - "step": 885 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9417, - "step": 886 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3315, - "step": 887 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0012, - "step": 888 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6386, - "step": 889 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0377, - "step": 890 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6707, - "step": 891 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4955, - "step": 892 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7343, - "step": 893 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8305, - "step": 894 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7016, - "step": 895 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7149, - "step": 896 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5649, - "step": 897 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.815, - "step": 898 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6135, - "step": 899 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8776, - "step": 900 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7288, - "step": 901 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8019, - "step": 902 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0921, - "step": 903 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.824, - "step": 904 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7151, - "step": 905 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5054, - "step": 906 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8095, - "step": 907 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3218, - "step": 908 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9993, - "step": 909 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4433, - "step": 910 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5863, - "step": 911 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.505, - "step": 912 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9734, - "step": 913 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1792, - "step": 914 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4574, - "step": 915 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2787, - "step": 916 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8201, - "step": 917 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2537, - "step": 918 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1387, - "step": 919 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7161, - "step": 920 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2207, - "step": 921 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7953, - "step": 922 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9949, - "step": 923 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9173, - "step": 924 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7903, - "step": 925 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4784, - "step": 926 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2264, - "step": 927 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.566, - "step": 928 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0686, - "step": 929 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.791, - "step": 930 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8393, - "step": 931 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4387, - "step": 932 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2374, - "step": 933 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9598, - "step": 934 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1597, - "step": 935 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0403, - "step": 936 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3301, - "step": 937 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.072, - "step": 938 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4788, - "step": 939 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0656, - "step": 940 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9647, - "step": 941 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1168, - "step": 942 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0293, - "step": 943 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3622, - "step": 944 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8957, - "step": 945 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4, - "step": 946 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6626, - "step": 947 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8212, - "step": 948 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8638, - "step": 949 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6406, - "step": 950 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7069, - "step": 951 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1384, - "step": 952 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.612, - "step": 953 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7201, - "step": 954 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3532, - "step": 955 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1266, - "step": 956 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6192, - "step": 957 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.826, - "step": 958 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9338, - "step": 959 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4487, - "step": 960 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.872, - "step": 961 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8601, - "step": 962 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7401, - "step": 963 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5412, - "step": 964 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2501, - "step": 965 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6837, - "step": 966 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6494, - "step": 967 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.604, - "step": 968 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.837, - "step": 969 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3957, - "step": 970 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3281, - "step": 971 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8264, - "step": 972 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6554, - "step": 973 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5768, - "step": 974 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4187, - "step": 975 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8479, - "step": 976 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9849, - "step": 977 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6471, - "step": 978 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8041, - "step": 979 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8876, - "step": 980 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6423, - "step": 981 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5329, - "step": 982 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2801, - "step": 983 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1699, - "step": 984 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6469, - "step": 985 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6766, - "step": 986 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7538, - "step": 987 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9606, - "step": 988 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0713, - "step": 989 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4965, - "step": 990 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3408, - "step": 991 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4007, - "step": 992 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8921, - "step": 993 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8681, - "step": 994 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.8867, - "step": 995 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.467, - "step": 996 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7895, - "step": 997 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0523, - "step": 998 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4032, - "step": 999 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7719, - "step": 1000 - }, - { - "epoch": 0.01, - "eval_loss": 6.766034126281738, - "eval_runtime": 22.4042, - "eval_samples_per_second": 2.232, - "eval_steps_per_second": 1.116, - "step": 1000 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 4.338861379623413, - "step": 1000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0285, - "step": 1001 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4571, - "step": 1002 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7721, - "step": 1003 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5108, - "step": 1004 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3813, - "step": 1005 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7963, - "step": 1006 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1101, - "step": 1007 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.021, - "step": 1008 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5916, - "step": 1009 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8813, - "step": 1010 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1193, - "step": 1011 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5092, - "step": 1012 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8569, - "step": 1013 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.119, - "step": 1014 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3247, - "step": 1015 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2358, - "step": 1016 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2795, - "step": 1017 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3466, - "step": 1018 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5443, - "step": 1019 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7296, - "step": 1020 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0412, - "step": 1021 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4829, - "step": 1022 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7901, - "step": 1023 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8077, - "step": 1024 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4887, - "step": 1025 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3095, - "step": 1026 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3235, - "step": 1027 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6315, - "step": 1028 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4294, - "step": 1029 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8457, - "step": 1030 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7583, - "step": 1031 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3129, - "step": 1032 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1832, - "step": 1033 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1764, - "step": 1034 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0101, - "step": 1035 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6524, - "step": 1036 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2825, - "step": 1037 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2262, - "step": 1038 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2533, - "step": 1039 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8794, - "step": 1040 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7901, - "step": 1041 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8351, - "step": 1042 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5888, - "step": 1043 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8932, - "step": 1044 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2999, - "step": 1045 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8396, - "step": 1046 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4209, - "step": 1047 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1524, - "step": 1048 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7784, - "step": 1049 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0179, - "step": 1050 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1153, - "step": 1051 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2149, - "step": 1052 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0117, - "step": 1053 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9693, - "step": 1054 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5656, - "step": 1055 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5, - "step": 1056 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.102, - "step": 1057 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3079, - "step": 1058 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5754, - "step": 1059 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6989, - "step": 1060 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9597, - "step": 1061 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3743, - "step": 1062 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8887, - "step": 1063 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3779, - "step": 1064 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5001, - "step": 1065 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4095, - "step": 1066 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5073, - "step": 1067 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1331, - "step": 1068 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.323, - "step": 1069 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6116, - "step": 1070 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1212, - "step": 1071 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0951, - "step": 1072 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2463, - "step": 1073 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4488, - "step": 1074 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.279, - "step": 1075 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5728, - "step": 1076 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1362, - "step": 1077 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6648, - "step": 1078 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.427, - "step": 1079 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8145, - "step": 1080 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5308, - "step": 1081 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.974, - "step": 1082 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1965, - "step": 1083 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8749, - "step": 1084 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7352, - "step": 1085 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7934, - "step": 1086 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6003, - "step": 1087 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5775, - "step": 1088 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.519, - "step": 1089 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7403, - "step": 1090 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8145, - "step": 1091 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5776, - "step": 1092 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3753, - "step": 1093 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9586, - "step": 1094 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7263, - "step": 1095 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7034, - "step": 1096 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0579, - "step": 1097 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8419, - "step": 1098 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0751, - "step": 1099 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6438, - "step": 1100 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8744, - "step": 1101 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4992, - "step": 1102 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8094, - "step": 1103 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.162, - "step": 1104 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8351, - "step": 1105 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8845, - "step": 1106 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1894, - "step": 1107 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.8333, - "step": 1108 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4226, - "step": 1109 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0712, - "step": 1110 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9981, - "step": 1111 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5885, - "step": 1112 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1915, - "step": 1113 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8003, - "step": 1114 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5566, - "step": 1115 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4085, - "step": 1116 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0793, - "step": 1117 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0909, - "step": 1118 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2273, - "step": 1119 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8273, - "step": 1120 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0231, - "step": 1121 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7229, - "step": 1122 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4479, - "step": 1123 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2178, - "step": 1124 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9038, - "step": 1125 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2653, - "step": 1126 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2974, - "step": 1127 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3003, - "step": 1128 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7853, - "step": 1129 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9143, - "step": 1130 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2573, - "step": 1131 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7091, - "step": 1132 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3372, - "step": 1133 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4165, - "step": 1134 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4422, - "step": 1135 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7693, - "step": 1136 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7802, - "step": 1137 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7263, - "step": 1138 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6749, - "step": 1139 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9459, - "step": 1140 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9697, - "step": 1141 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4506, - "step": 1142 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5099, - "step": 1143 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1475, - "step": 1144 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3769, - "step": 1145 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2035, - "step": 1146 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6017, - "step": 1147 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.463, - "step": 1148 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3844, - "step": 1149 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5306, - "step": 1150 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5502, - "step": 1151 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7394, - "step": 1152 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5626, - "step": 1153 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1618, - "step": 1154 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5174, - "step": 1155 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1038, - "step": 1156 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3789, - "step": 1157 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2985, - "step": 1158 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4763, - "step": 1159 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5071, - "step": 1160 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0827, - "step": 1161 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7349, - "step": 1162 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.798, - "step": 1163 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3176, - "step": 1164 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8114, - "step": 1165 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3379, - "step": 1166 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1157, - "step": 1167 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4675, - "step": 1168 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2721, - "step": 1169 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0603, - "step": 1170 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6358, - "step": 1171 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0865, - "step": 1172 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.709, - "step": 1173 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7705, - "step": 1174 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7677, - "step": 1175 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2418, - "step": 1176 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7114, - "step": 1177 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1165, - "step": 1178 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9654, - "step": 1179 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0672, - "step": 1180 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1738, - "step": 1181 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7604, - "step": 1182 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8426, - "step": 1183 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0231, - "step": 1184 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2938, - "step": 1185 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.783, - "step": 1186 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3328, - "step": 1187 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.321, - "step": 1188 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6368, - "step": 1189 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.101, - "step": 1190 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6777, - "step": 1191 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0831, - "step": 1192 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5853, - "step": 1193 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7923, - "step": 1194 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3734, - "step": 1195 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4268, - "step": 1196 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6796, - "step": 1197 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9028, - "step": 1198 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3716, - "step": 1199 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6761, - "step": 1200 - }, - { - "epoch": 0.01, - "eval_loss": 6.9188361167907715, - "eval_runtime": 22.426, - "eval_samples_per_second": 2.23, - "eval_steps_per_second": 1.115, - "step": 1200 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 3.3686839294433595, - "step": 1200 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8855, - "step": 1201 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8206, - "step": 1202 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4401, - "step": 1203 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2366, - "step": 1204 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9885, - "step": 1205 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5444, - "step": 1206 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4259, - "step": 1207 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5369, - "step": 1208 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0839, - "step": 1209 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7622, - "step": 1210 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8979, - "step": 1211 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5508, - "step": 1212 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6439, - "step": 1213 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6249, - "step": 1214 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.495, - "step": 1215 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0642, - "step": 1216 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8997, - "step": 1217 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6275, - "step": 1218 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3317, - "step": 1219 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4635, - "step": 1220 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5197, - "step": 1221 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5928, - "step": 1222 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2363, - "step": 1223 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0266, - "step": 1224 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3356, - "step": 1225 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7927, - "step": 1226 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6952, - "step": 1227 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8878, - "step": 1228 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7472, - "step": 1229 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6454, - "step": 1230 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4972, - "step": 1231 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3347, - "step": 1232 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1631, - "step": 1233 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4708, - "step": 1234 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5697, - "step": 1235 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8218, - "step": 1236 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.269, - "step": 1237 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4165, - "step": 1238 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3653, - "step": 1239 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0152, - "step": 1240 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9157, - "step": 1241 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4086, - "step": 1242 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2493, - "step": 1243 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8279, - "step": 1244 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6649, - "step": 1245 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4405, - "step": 1246 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1992, - "step": 1247 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2055, - "step": 1248 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4395, - "step": 1249 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2475, - "step": 1250 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8205, - "step": 1251 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1053, - "step": 1252 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7494, - "step": 1253 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7387, - "step": 1254 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8983, - "step": 1255 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5614, - "step": 1256 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7617, - "step": 1257 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2445, - "step": 1258 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3043, - "step": 1259 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4214, - "step": 1260 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1384, - "step": 1261 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3914, - "step": 1262 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3287, - "step": 1263 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2174, - "step": 1264 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4397, - "step": 1265 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6875, - "step": 1266 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4512, - "step": 1267 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2834, - "step": 1268 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7651, - "step": 1269 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9263, - "step": 1270 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6721, - "step": 1271 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9178, - "step": 1272 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7967, - "step": 1273 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5242, - "step": 1274 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7794, - "step": 1275 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4256, - "step": 1276 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5788, - "step": 1277 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7586, - "step": 1278 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.964, - "step": 1279 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0749, - "step": 1280 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6248, - "step": 1281 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2465, - "step": 1282 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1591, - "step": 1283 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4328, - "step": 1284 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.534, - "step": 1285 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.523, - "step": 1286 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5672, - "step": 1287 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9162, - "step": 1288 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1089, - "step": 1289 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3287, - "step": 1290 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2499, - "step": 1291 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9645, - "step": 1292 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3903, - "step": 1293 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5322, - "step": 1294 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2211, - "step": 1295 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2788, - "step": 1296 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1862, - "step": 1297 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2678, - "step": 1298 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5399, - "step": 1299 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7935, - "step": 1300 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0391, - "step": 1301 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1049, - "step": 1302 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.365, - "step": 1303 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8809, - "step": 1304 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2335, - "step": 1305 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.5135, - "step": 1306 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2378, - "step": 1307 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9265, - "step": 1308 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.641, - "step": 1309 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9822, - "step": 1310 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3369, - "step": 1311 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3735, - "step": 1312 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2618, - "step": 1313 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6854, - "step": 1314 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3748, - "step": 1315 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9206, - "step": 1316 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1969, - "step": 1317 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1245, - "step": 1318 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9977, - "step": 1319 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5319, - "step": 1320 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4431, - "step": 1321 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7264, - "step": 1322 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.05, - "step": 1323 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3118, - "step": 1324 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4575, - "step": 1325 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.593, - "step": 1326 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0061, - "step": 1327 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2481, - "step": 1328 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8017, - "step": 1329 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8617, - "step": 1330 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7036, - "step": 1331 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0091, - "step": 1332 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9687, - "step": 1333 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3925, - "step": 1334 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1127, - "step": 1335 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8163, - "step": 1336 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0639, - "step": 1337 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8929, - "step": 1338 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5011, - "step": 1339 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.033, - "step": 1340 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0526, - "step": 1341 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4428, - "step": 1342 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3477, - "step": 1343 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.881, - "step": 1344 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5276, - "step": 1345 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4183, - "step": 1346 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4943, - "step": 1347 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9187, - "step": 1348 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1003, - "step": 1349 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1187, - "step": 1350 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8081, - "step": 1351 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4695, - "step": 1352 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5761, - "step": 1353 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9635, - "step": 1354 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2133, - "step": 1355 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2611, - "step": 1356 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6885, - "step": 1357 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1157, - "step": 1358 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4421, - "step": 1359 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2128, - "step": 1360 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6978, - "step": 1361 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9804, - "step": 1362 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3426, - "step": 1363 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2676, - "step": 1364 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.325, - "step": 1365 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1263, - "step": 1366 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7481, - "step": 1367 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6891, - "step": 1368 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8568, - "step": 1369 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9893, - "step": 1370 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0302, - "step": 1371 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3204, - "step": 1372 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9008, - "step": 1373 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2624, - "step": 1374 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6234, - "step": 1375 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2286, - "step": 1376 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3426, - "step": 1377 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1962, - "step": 1378 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3142, - "step": 1379 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.414, - "step": 1380 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0191, - "step": 1381 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4953, - "step": 1382 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6694, - "step": 1383 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8611, - "step": 1384 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.86, - "step": 1385 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6519, - "step": 1386 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.394, - "step": 1387 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2117, - "step": 1388 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9924, - "step": 1389 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.58, - "step": 1390 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4415, - "step": 1391 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7196, - "step": 1392 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7388, - "step": 1393 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4784, - "step": 1394 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.496, - "step": 1395 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8706, - "step": 1396 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1858, - "step": 1397 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9038, - "step": 1398 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4852, - "step": 1399 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2717, - "step": 1400 - }, - { - "epoch": 0.01, - "eval_loss": 6.97923469543457, - "eval_runtime": 22.472, - "eval_samples_per_second": 2.225, - "eval_steps_per_second": 1.112, - "step": 1400 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.657382688522339, - "step": 1400 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.843, - "step": 1401 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5611, - "step": 1402 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2434, - "step": 1403 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3136, - "step": 1404 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.686, - "step": 1405 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6365, - "step": 1406 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1811, - "step": 1407 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7537, - "step": 1408 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2949, - "step": 1409 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4827, - "step": 1410 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0965, - "step": 1411 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.007, - "step": 1412 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2861, - "step": 1413 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1774, - "step": 1414 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7777, - "step": 1415 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0259, - "step": 1416 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9024, - "step": 1417 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4786, - "step": 1418 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5873, - "step": 1419 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2744, - "step": 1420 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9484, - "step": 1421 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2093, - "step": 1422 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3394, - "step": 1423 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1247, - "step": 1424 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0691, - "step": 1425 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.559, - "step": 1426 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1518, - "step": 1427 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4143, - "step": 1428 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0287, - "step": 1429 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8112, - "step": 1430 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2625, - "step": 1431 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3528, - "step": 1432 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2715, - "step": 1433 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7849, - "step": 1434 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2002, - "step": 1435 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0658, - "step": 1436 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0671, - "step": 1437 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2577, - "step": 1438 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.803, - "step": 1439 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2974, - "step": 1440 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0897, - "step": 1441 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0805, - "step": 1442 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7681, - "step": 1443 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6565, - "step": 1444 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0174, - "step": 1445 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8507, - "step": 1446 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2105, - "step": 1447 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.572, - "step": 1448 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2904, - "step": 1449 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4623, - "step": 1450 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4774, - "step": 1451 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1277, - "step": 1452 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6204, - "step": 1453 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3219, - "step": 1454 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2517, - "step": 1455 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3026, - "step": 1456 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4016, - "step": 1457 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5256, - "step": 1458 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9316, - "step": 1459 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.631, - "step": 1460 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2888, - "step": 1461 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5511, - "step": 1462 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.9799, - "step": 1463 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6982, - "step": 1464 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4923, - "step": 1465 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8329, - "step": 1466 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2733, - "step": 1467 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8221, - "step": 1468 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.363, - "step": 1469 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6348, - "step": 1470 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3319, - "step": 1471 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6768, - "step": 1472 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1985, - "step": 1473 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6109, - "step": 1474 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.974, - "step": 1475 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8902, - "step": 1476 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6762, - "step": 1477 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8541, - "step": 1478 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3867, - "step": 1479 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9624, - "step": 1480 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8768, - "step": 1481 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7598, - "step": 1482 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6522, - "step": 1483 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8156, - "step": 1484 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3791, - "step": 1485 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2178, - "step": 1486 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8448, - "step": 1487 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5377, - "step": 1488 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7407, - "step": 1489 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7636, - "step": 1490 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4325, - "step": 1491 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8966, - "step": 1492 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0626, - "step": 1493 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.255, - "step": 1494 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2802, - "step": 1495 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.894, - "step": 1496 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6482, - "step": 1497 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8903, - "step": 1498 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8672, - "step": 1499 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6079, - "step": 1500 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6217, - "step": 1501 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2361, - "step": 1502 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3962, - "step": 1503 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0036, - "step": 1504 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5926, - "step": 1505 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.114, - "step": 1506 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4419, - "step": 1507 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7838, - "step": 1508 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6635, - "step": 1509 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2906, - "step": 1510 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4186, - "step": 1511 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4783, - "step": 1512 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1226, - "step": 1513 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2458, - "step": 1514 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5302, - "step": 1515 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1515, - "step": 1516 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4182, - "step": 1517 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8248, - "step": 1518 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2349, - "step": 1519 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9314, - "step": 1520 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1161, - "step": 1521 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4183, - "step": 1522 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4454, - "step": 1523 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5588, - "step": 1524 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8026, - "step": 1525 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7695, - "step": 1526 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3636, - "step": 1527 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2776, - "step": 1528 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5386, - "step": 1529 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.521, - "step": 1530 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8388, - "step": 1531 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3561, - "step": 1532 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9606, - "step": 1533 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9438, - "step": 1534 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7665, - "step": 1535 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5826, - "step": 1536 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.0798, - "step": 1537 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8545, - "step": 1538 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.302, - "step": 1539 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1092, - "step": 1540 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5021, - "step": 1541 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9384, - "step": 1542 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8761, - "step": 1543 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3316, - "step": 1544 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2051, - "step": 1545 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7907, - "step": 1546 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2534, - "step": 1547 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2274, - "step": 1548 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9226, - "step": 1549 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2502, - "step": 1550 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2703, - "step": 1551 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4359, - "step": 1552 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.128, - "step": 1553 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3147, - "step": 1554 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.026, - "step": 1555 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9393, - "step": 1556 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7753, - "step": 1557 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9049, - "step": 1558 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0538, - "step": 1559 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8691, - "step": 1560 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9377, - "step": 1561 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8385, - "step": 1562 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.939, - "step": 1563 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.727, - "step": 1564 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7866, - "step": 1565 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2439, - "step": 1566 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9607, - "step": 1567 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3505, - "step": 1568 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7729, - "step": 1569 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4669, - "step": 1570 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8178, - "step": 1571 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2173, - "step": 1572 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2136, - "step": 1573 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2888, - "step": 1574 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0386, - "step": 1575 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9041, - "step": 1576 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7544, - "step": 1577 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.3229, - "step": 1578 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4203, - "step": 1579 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.497, - "step": 1580 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8253, - "step": 1581 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0801, - "step": 1582 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1585, - "step": 1583 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6965, - "step": 1584 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.498, - "step": 1585 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8697, - "step": 1586 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2663, - "step": 1587 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7004, - "step": 1588 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6561, - "step": 1589 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.785, - "step": 1590 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5708, - "step": 1591 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.326, - "step": 1592 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2974, - "step": 1593 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1408, - "step": 1594 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6526, - "step": 1595 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4116, - "step": 1596 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0484, - "step": 1597 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3162, - "step": 1598 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3806, - "step": 1599 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0251, - "step": 1600 - }, - { - "epoch": 0.01, - "eval_loss": 6.617897987365723, - "eval_runtime": 22.4646, - "eval_samples_per_second": 2.226, - "eval_steps_per_second": 1.113, - "step": 1600 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.160770101547241, - "step": 1600 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9137, - "step": 1601 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2188, - "step": 1602 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7688, - "step": 1603 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9799, - "step": 1604 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5429, - "step": 1605 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8559, - "step": 1606 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3985, - "step": 1607 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9139, - "step": 1608 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3303, - "step": 1609 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5168, - "step": 1610 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5194, - "step": 1611 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9557, - "step": 1612 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7102, - "step": 1613 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8961, - "step": 1614 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6123, - "step": 1615 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7808, - "step": 1616 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4919, - "step": 1617 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0723, - "step": 1618 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2931, - "step": 1619 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8478, - "step": 1620 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7126, - "step": 1621 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6622, - "step": 1622 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3861, - "step": 1623 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9856, - "step": 1624 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5076, - "step": 1625 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4168, - "step": 1626 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2825, - "step": 1627 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7497, - "step": 1628 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5672, - "step": 1629 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4095, - "step": 1630 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.649, - "step": 1631 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3418, - "step": 1632 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1337, - "step": 1633 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3829, - "step": 1634 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0294, - "step": 1635 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2164, - "step": 1636 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3294, - "step": 1637 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7047, - "step": 1638 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5622, - "step": 1639 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4873, - "step": 1640 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6641, - "step": 1641 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3943, - "step": 1642 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2266, - "step": 1643 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0471, - "step": 1644 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5658, - "step": 1645 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6489, - "step": 1646 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3851, - "step": 1647 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7921, - "step": 1648 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4581, - "step": 1649 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1407, - "step": 1650 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2919, - "step": 1651 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4061, - "step": 1652 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3081, - "step": 1653 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0527, - "step": 1654 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8729, - "step": 1655 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.029, - "step": 1656 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6632, - "step": 1657 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7047, - "step": 1658 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6288, - "step": 1659 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8466, - "step": 1660 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7726, - "step": 1661 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.245, - "step": 1662 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0538, - "step": 1663 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3611, - "step": 1664 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.011, - "step": 1665 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6491, - "step": 1666 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3409, - "step": 1667 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.262, - "step": 1668 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.781, - "step": 1669 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8025, - "step": 1670 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7529, - "step": 1671 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2322, - "step": 1672 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4527, - "step": 1673 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9457, - "step": 1674 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.859, - "step": 1675 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9207, - "step": 1676 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5378, - "step": 1677 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6585, - "step": 1678 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9523, - "step": 1679 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1348, - "step": 1680 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9582, - "step": 1681 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.416, - "step": 1682 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8214, - "step": 1683 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8833, - "step": 1684 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1021, - "step": 1685 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7392, - "step": 1686 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2616, - "step": 1687 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.325, - "step": 1688 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3814, - "step": 1689 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2816, - "step": 1690 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.033, - "step": 1691 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5742, - "step": 1692 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0841, - "step": 1693 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2888, - "step": 1694 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9263, - "step": 1695 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7552, - "step": 1696 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4163, - "step": 1697 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6207, - "step": 1698 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.938, - "step": 1699 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2925, - "step": 1700 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0401, - "step": 1701 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1536, - "step": 1702 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2754, - "step": 1703 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6765, - "step": 1704 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.63, - "step": 1705 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6902, - "step": 1706 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6434, - "step": 1707 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2283, - "step": 1708 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9377, - "step": 1709 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.371, - "step": 1710 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6569, - "step": 1711 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2221, - "step": 1712 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5375, - "step": 1713 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2189, - "step": 1714 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.769, - "step": 1715 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0089, - "step": 1716 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6402, - "step": 1717 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4812, - "step": 1718 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9754, - "step": 1719 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8435, - "step": 1720 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9424, - "step": 1721 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5465, - "step": 1722 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.477, - "step": 1723 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2254, - "step": 1724 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3663, - "step": 1725 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.663, - "step": 1726 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6951, - "step": 1727 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.856, - "step": 1728 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0652, - "step": 1729 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6929, - "step": 1730 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8069, - "step": 1731 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.02, - "step": 1732 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0782, - "step": 1733 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0236, - "step": 1734 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2769, - "step": 1735 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7126, - "step": 1736 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2746, - "step": 1737 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8726, - "step": 1738 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7962, - "step": 1739 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7602, - "step": 1740 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.3105, - "step": 1741 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0771, - "step": 1742 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4738, - "step": 1743 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2952, - "step": 1744 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2692, - "step": 1745 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7201, - "step": 1746 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2978, - "step": 1747 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.518, - "step": 1748 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.659, - "step": 1749 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9101, - "step": 1750 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8397, - "step": 1751 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0451, - "step": 1752 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7696, - "step": 1753 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1377, - "step": 1754 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2621, - "step": 1755 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2143, - "step": 1756 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4378, - "step": 1757 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8631, - "step": 1758 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.019, - "step": 1759 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7475, - "step": 1760 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6228, - "step": 1761 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0703, - "step": 1762 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3426, - "step": 1763 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0842, - "step": 1764 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1032, - "step": 1765 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6321, - "step": 1766 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7804, - "step": 1767 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6566, - "step": 1768 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4985, - "step": 1769 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1129, - "step": 1770 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8081, - "step": 1771 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8441, - "step": 1772 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4054, - "step": 1773 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6334, - "step": 1774 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4323, - "step": 1775 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.18, - "step": 1776 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7573, - "step": 1777 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4642, - "step": 1778 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.038, - "step": 1779 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3785, - "step": 1780 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5088, - "step": 1781 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0139, - "step": 1782 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0999, - "step": 1783 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3224, - "step": 1784 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.634, - "step": 1785 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1264, - "step": 1786 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.317, - "step": 1787 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1279, - "step": 1788 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2364, - "step": 1789 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0627, - "step": 1790 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2471, - "step": 1791 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8407, - "step": 1792 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7083, - "step": 1793 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4522, - "step": 1794 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0308, - "step": 1795 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6915, - "step": 1796 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.082, - "step": 1797 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7956, - "step": 1798 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7007, - "step": 1799 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9197, - "step": 1800 - }, - { - "epoch": 0.01, - "eval_loss": 6.619495868682861, - "eval_runtime": 22.4352, - "eval_samples_per_second": 2.229, - "eval_steps_per_second": 1.114, - "step": 1800 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.238778591156006, - "step": 1800 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1537, - "step": 1801 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.684, - "step": 1802 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7862, - "step": 1803 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3518, - "step": 1804 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1795, - "step": 1805 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0054, - "step": 1806 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8997, - "step": 1807 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9002, - "step": 1808 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2805, - "step": 1809 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1203, - "step": 1810 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0206, - "step": 1811 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0151, - "step": 1812 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3864, - "step": 1813 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1117, - "step": 1814 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8487, - "step": 1815 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.59, - "step": 1816 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1615, - "step": 1817 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7362, - "step": 1818 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2294, - "step": 1819 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5622, - "step": 1820 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5437, - "step": 1821 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.093, - "step": 1822 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0343, - "step": 1823 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4454, - "step": 1824 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5138, - "step": 1825 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5605, - "step": 1826 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.322, - "step": 1827 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6489, - "step": 1828 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.331, - "step": 1829 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6462, - "step": 1830 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.932, - "step": 1831 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9058, - "step": 1832 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3433, - "step": 1833 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4365, - "step": 1834 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3282, - "step": 1835 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.448, - "step": 1836 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5369, - "step": 1837 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.177, - "step": 1838 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3552, - "step": 1839 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4568, - "step": 1840 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0602, - "step": 1841 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7449, - "step": 1842 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2675, - "step": 1843 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0317, - "step": 1844 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4342, - "step": 1845 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8688, - "step": 1846 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3571, - "step": 1847 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3776, - "step": 1848 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2248, - "step": 1849 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6073, - "step": 1850 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8425, - "step": 1851 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5954, - "step": 1852 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4197, - "step": 1853 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8624, - "step": 1854 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9652, - "step": 1855 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7145, - "step": 1856 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5309, - "step": 1857 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4356, - "step": 1858 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6508, - "step": 1859 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0955, - "step": 1860 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6886, - "step": 1861 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7644, - "step": 1862 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5709, - "step": 1863 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6212, - "step": 1864 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6325, - "step": 1865 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6805, - "step": 1866 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1464, - "step": 1867 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9244, - "step": 1868 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.336, - "step": 1869 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8783, - "step": 1870 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8236, - "step": 1871 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.084, - "step": 1872 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9639, - "step": 1873 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4173, - "step": 1874 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0042, - "step": 1875 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2519, - "step": 1876 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4656, - "step": 1877 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5136, - "step": 1878 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3918, - "step": 1879 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9696, - "step": 1880 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9736, - "step": 1881 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6192, - "step": 1882 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3476, - "step": 1883 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3048, - "step": 1884 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1116, - "step": 1885 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.971, - "step": 1886 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0741, - "step": 1887 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1418, - "step": 1888 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3487, - "step": 1889 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.38, - "step": 1890 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6561, - "step": 1891 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5606, - "step": 1892 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8623, - "step": 1893 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2984, - "step": 1894 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6179, - "step": 1895 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8625, - "step": 1896 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8596, - "step": 1897 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7205, - "step": 1898 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6727, - "step": 1899 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.016, - "step": 1900 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9868, - "step": 1901 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.632, - "step": 1902 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5133, - "step": 1903 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7476, - "step": 1904 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4174, - "step": 1905 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6789, - "step": 1906 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4534, - "step": 1907 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3335, - "step": 1908 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7921, - "step": 1909 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9567, - "step": 1910 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.1739, - "step": 1911 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7514, - "step": 1912 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3858, - "step": 1913 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0462, - "step": 1914 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3817, - "step": 1915 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9739, - "step": 1916 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1122, - "step": 1917 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3361, - "step": 1918 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3184, - "step": 1919 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7342, - "step": 1920 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.375, - "step": 1921 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6841, - "step": 1922 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0773, - "step": 1923 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8916, - "step": 1924 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7176, - "step": 1925 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8841, - "step": 1926 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8345, - "step": 1927 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.561, - "step": 1928 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5392, - "step": 1929 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1627, - "step": 1930 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0657, - "step": 1931 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7385, - "step": 1932 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5533, - "step": 1933 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0925, - "step": 1934 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8752, - "step": 1935 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4039, - "step": 1936 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6472, - "step": 1937 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1819, - "step": 1938 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5919, - "step": 1939 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6527, - "step": 1940 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5188, - "step": 1941 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9856, - "step": 1942 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7038, - "step": 1943 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.911, - "step": 1944 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.497, - "step": 1945 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1804, - "step": 1946 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3949, - "step": 1947 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0433, - "step": 1948 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4706, - "step": 1949 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5896, - "step": 1950 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.557, - "step": 1951 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.34, - "step": 1952 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7865, - "step": 1953 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0797, - "step": 1954 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2896, - "step": 1955 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4096, - "step": 1956 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9538, - "step": 1957 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2778, - "step": 1958 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4968, - "step": 1959 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8328, - "step": 1960 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4597, - "step": 1961 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6776, - "step": 1962 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4861, - "step": 1963 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5831, - "step": 1964 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4585, - "step": 1965 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7898, - "step": 1966 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8714, - "step": 1967 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.752, - "step": 1968 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9024, - "step": 1969 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.058, - "step": 1970 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1745, - "step": 1971 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2162, - "step": 1972 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2668, - "step": 1973 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3307, - "step": 1974 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3285, - "step": 1975 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1353, - "step": 1976 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8069, - "step": 1977 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6885, - "step": 1978 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5946, - "step": 1979 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6828, - "step": 1980 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6516, - "step": 1981 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.261, - "step": 1982 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.524, - "step": 1983 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.585, - "step": 1984 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8883, - "step": 1985 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.689, - "step": 1986 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1083, - "step": 1987 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1606, - "step": 1988 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9243, - "step": 1989 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6597, - "step": 1990 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2849, - "step": 1991 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3715, - "step": 1992 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7262, - "step": 1993 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6862, - "step": 1994 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5412, - "step": 1995 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7483, - "step": 1996 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3391, - "step": 1997 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2642, - "step": 1998 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1519, - "step": 1999 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7098, - "step": 2000 - }, - { - "epoch": 0.02, - "eval_loss": 6.762476921081543, - "eval_runtime": 22.4899, - "eval_samples_per_second": 2.223, - "eval_steps_per_second": 1.112, - "step": 2000 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.4606559085845947, - "step": 2000 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8099, - "step": 2001 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0567, - "step": 2002 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2981, - "step": 2003 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2668, - "step": 2004 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.139, - "step": 2005 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.903, - "step": 2006 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2182, - "step": 2007 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2347, - "step": 2008 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8383, - "step": 2009 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0211, - "step": 2010 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2572, - "step": 2011 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2877, - "step": 2012 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3577, - "step": 2013 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2022, - "step": 2014 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2722, - "step": 2015 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0552, - "step": 2016 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.9857, - "step": 2017 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0519, - "step": 2018 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7118, - "step": 2019 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4465, - "step": 2020 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3009, - "step": 2021 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3614, - "step": 2022 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3493, - "step": 2023 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.34, - "step": 2024 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0416, - "step": 2025 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.686, - "step": 2026 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6021, - "step": 2027 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4161, - "step": 2028 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.0029, - "step": 2029 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.8579, - "step": 2030 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0247, - "step": 2031 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4184, - "step": 2032 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4962, - "step": 2033 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5137, - "step": 2034 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.6692, - "step": 2035 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7161, - "step": 2036 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.617, - "step": 2037 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.413, - "step": 2038 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3056, - "step": 2039 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9441, - "step": 2040 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9943, - "step": 2041 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5703, - "step": 2042 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1881, - "step": 2043 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5763, - "step": 2044 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6389, - "step": 2045 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1717, - "step": 2046 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5482, - "step": 2047 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9469, - "step": 2048 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7685, - "step": 2049 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1381, - "step": 2050 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6961, - "step": 2051 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6425, - "step": 2052 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5354, - "step": 2053 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2404, - "step": 2054 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1556, - "step": 2055 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7133, - "step": 2056 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8166, - "step": 2057 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5071, - "step": 2058 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5429, - "step": 2059 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0367, - "step": 2060 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5386, - "step": 2061 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.5899, - "step": 2062 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2968, - "step": 2063 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9951, - "step": 2064 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8608, - "step": 2065 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4735, - "step": 2066 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5612, - "step": 2067 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7461, - "step": 2068 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5887, - "step": 2069 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3426, - "step": 2070 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5589, - "step": 2071 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.498, - "step": 2072 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1306, - "step": 2073 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.3492, - "step": 2074 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2311, - "step": 2075 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8798, - "step": 2076 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6799, - "step": 2077 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5011, - "step": 2078 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8892, - "step": 2079 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6449, - "step": 2080 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9117, - "step": 2081 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1157, - "step": 2082 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.196, - "step": 2083 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.9364, - "step": 2084 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3618, - "step": 2085 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3755, - "step": 2086 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4564, - "step": 2087 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4912, - "step": 2088 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.113, - "step": 2089 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0588, - "step": 2090 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.668, - "step": 2091 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.08, - "step": 2092 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2042, - "step": 2093 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4134, - "step": 2094 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0456, - "step": 2095 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2245, - "step": 2096 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4936, - "step": 2097 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5158, - "step": 2098 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7269, - "step": 2099 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7077, - "step": 2100 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6008, - "step": 2101 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4652, - "step": 2102 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.918, - "step": 2103 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.5819, - "step": 2104 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7764, - "step": 2105 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.0525, - "step": 2106 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5359, - "step": 2107 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4925, - "step": 2108 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4857, - "step": 2109 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.9445, - "step": 2110 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8494, - "step": 2111 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1513, - "step": 2112 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2552, - "step": 2113 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7229, - "step": 2114 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8571, - "step": 2115 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5968, - "step": 2116 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8806, - "step": 2117 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4641, - "step": 2118 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6039, - "step": 2119 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1379, - "step": 2120 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6688, - "step": 2121 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.293, - "step": 2122 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5664, - "step": 2123 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0825, - "step": 2124 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9788, - "step": 2125 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.9641, - "step": 2126 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7799, - "step": 2127 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0619, - "step": 2128 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0022, - "step": 2129 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8022, - "step": 2130 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5301, - "step": 2131 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.681, - "step": 2132 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7362, - "step": 2133 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5462, - "step": 2134 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2356, - "step": 2135 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2259, - "step": 2136 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3646, - "step": 2137 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8765, - "step": 2138 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6487, - "step": 2139 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9622, - "step": 2140 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1761, - "step": 2141 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6922, - "step": 2142 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0371, - "step": 2143 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7869, - "step": 2144 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3725, - "step": 2145 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8894, - "step": 2146 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6083, - "step": 2147 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4451, - "step": 2148 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1149, - "step": 2149 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8058, - "step": 2150 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1308, - "step": 2151 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1447, - "step": 2152 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.208, - "step": 2153 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5193, - "step": 2154 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7729, - "step": 2155 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5019, - "step": 2156 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6092, - "step": 2157 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1853, - "step": 2158 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7, - "step": 2159 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1638, - "step": 2160 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.762, - "step": 2161 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7455, - "step": 2162 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9372, - "step": 2163 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4569, - "step": 2164 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6705, - "step": 2165 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1988, - "step": 2166 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2526, - "step": 2167 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9066, - "step": 2168 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1365, - "step": 2169 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3422, - "step": 2170 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2691, - "step": 2171 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9008, - "step": 2172 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2555, - "step": 2173 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0886, - "step": 2174 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0369, - "step": 2175 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5566, - "step": 2176 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2567, - "step": 2177 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0179, - "step": 2178 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5383, - "step": 2179 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4797, - "step": 2180 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0163, - "step": 2181 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2658, - "step": 2182 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1337, - "step": 2183 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3287, - "step": 2184 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7874, - "step": 2185 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7153, - "step": 2186 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7037, - "step": 2187 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4412, - "step": 2188 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3912, - "step": 2189 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.034, - "step": 2190 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4697, - "step": 2191 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6243, - "step": 2192 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1133, - "step": 2193 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.9005, - "step": 2194 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7386, - "step": 2195 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4169, - "step": 2196 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8139, - "step": 2197 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3012, - "step": 2198 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8223, - "step": 2199 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3757, - "step": 2200 - }, - { - "epoch": 0.02, - "eval_loss": 6.580160140991211, - "eval_runtime": 22.4971, - "eval_samples_per_second": 2.223, - "eval_steps_per_second": 1.111, - "step": 2200 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.755114164352417, - "step": 2200 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5282, - "step": 2201 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2478, - "step": 2202 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.916, - "step": 2203 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5069, - "step": 2204 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5952, - "step": 2205 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5059, - "step": 2206 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7434, - "step": 2207 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.625, - "step": 2208 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1674, - "step": 2209 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3937, - "step": 2210 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8783, - "step": 2211 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5263, - "step": 2212 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7887, - "step": 2213 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8911, - "step": 2214 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7211, - "step": 2215 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.089, - "step": 2216 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6373, - "step": 2217 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7728, - "step": 2218 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6957, - "step": 2219 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.43, - "step": 2220 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9673, - "step": 2221 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8942, - "step": 2222 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2893, - "step": 2223 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1505, - "step": 2224 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3702, - "step": 2225 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1731, - "step": 2226 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.997, - "step": 2227 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9531, - "step": 2228 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0748, - "step": 2229 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0642, - "step": 2230 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9469, - "step": 2231 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2265, - "step": 2232 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6461, - "step": 2233 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.064, - "step": 2234 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1414, - "step": 2235 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5375, - "step": 2236 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6348, - "step": 2237 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9975, - "step": 2238 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5242, - "step": 2239 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3179, - "step": 2240 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6054, - "step": 2241 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1832, - "step": 2242 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.0572, - "step": 2243 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2049, - "step": 2244 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6348, - "step": 2245 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.67, - "step": 2246 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.5627, - "step": 2247 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1851, - "step": 2248 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6792, - "step": 2249 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6344, - "step": 2250 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7603, - "step": 2251 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7737, - "step": 2252 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5323, - "step": 2253 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4059, - "step": 2254 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9343, - "step": 2255 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0156, - "step": 2256 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1851, - "step": 2257 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.44, - "step": 2258 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9079, - "step": 2259 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4982, - "step": 2260 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 12.3777, - "step": 2261 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.1265, - "step": 2262 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1428, - "step": 2263 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8072, - "step": 2264 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.911, - "step": 2265 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9453, - "step": 2266 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0168, - "step": 2267 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2098, - "step": 2268 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4417, - "step": 2269 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8449, - "step": 2270 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.394, - "step": 2271 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7642, - "step": 2272 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5555, - "step": 2273 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3576, - "step": 2274 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.386, - "step": 2275 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6677, - "step": 2276 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2385, - "step": 2277 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8074, - "step": 2278 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2963, - "step": 2279 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3612, - "step": 2280 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1837, - "step": 2281 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5882, - "step": 2282 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0968, - "step": 2283 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2376, - "step": 2284 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3835, - "step": 2285 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0143, - "step": 2286 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.36, - "step": 2287 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0121, - "step": 2288 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0144, - "step": 2289 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6807, - "step": 2290 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8854, - "step": 2291 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1727, - "step": 2292 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.533, - "step": 2293 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9793, - "step": 2294 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.538, - "step": 2295 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.145, - "step": 2296 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.163, - "step": 2297 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1045, - "step": 2298 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0209, - "step": 2299 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9728, - "step": 2300 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8902, - "step": 2301 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3075, - "step": 2302 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.2194, - "step": 2303 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7375, - "step": 2304 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3863, - "step": 2305 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1317, - "step": 2306 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1878, - "step": 2307 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6124, - "step": 2308 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8843, - "step": 2309 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3988, - "step": 2310 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3523, - "step": 2311 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5766, - "step": 2312 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9096, - "step": 2313 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9315, - "step": 2314 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4044, - "step": 2315 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6206, - "step": 2316 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2429, - "step": 2317 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0383, - "step": 2318 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4282, - "step": 2319 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8973, - "step": 2320 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1771, - "step": 2321 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.624, - "step": 2322 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5197, - "step": 2323 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7313, - "step": 2324 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8234, - "step": 2325 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1702, - "step": 2326 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.536, - "step": 2327 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1904, - "step": 2328 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2077, - "step": 2329 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.891, - "step": 2330 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6784, - "step": 2331 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6611, - "step": 2332 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3402, - "step": 2333 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 11.1523, - "step": 2334 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5547, - "step": 2335 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3485, - "step": 2336 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8289, - "step": 2337 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2558, - "step": 2338 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1794, - "step": 2339 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8782, - "step": 2340 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.415, - "step": 2341 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5257, - "step": 2342 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4751, - "step": 2343 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2259, - "step": 2344 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8681, - "step": 2345 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6307, - "step": 2346 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1487, - "step": 2347 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.3949, - "step": 2348 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6988, - "step": 2349 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1299, - "step": 2350 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9938, - "step": 2351 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4176, - "step": 2352 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0184, - "step": 2353 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2779, - "step": 2354 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0162, - "step": 2355 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2335, - "step": 2356 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5505, - "step": 2357 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6445, - "step": 2358 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6513, - "step": 2359 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8503, - "step": 2360 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1817, - "step": 2361 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4376, - "step": 2362 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1351, - "step": 2363 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7566, - "step": 2364 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.626, - "step": 2365 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5818, - "step": 2366 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3033, - "step": 2367 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9289, - "step": 2368 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0301, - "step": 2369 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4713, - "step": 2370 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0931, - "step": 2371 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5812, - "step": 2372 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2272, - "step": 2373 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5174, - "step": 2374 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1849, - "step": 2375 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7496, - "step": 2376 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.776, - "step": 2377 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3555, - "step": 2378 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.688, - "step": 2379 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0143, - "step": 2380 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7788, - "step": 2381 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7772, - "step": 2382 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6875, - "step": 2383 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9944, - "step": 2384 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8363, - "step": 2385 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7276, - "step": 2386 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4892, - "step": 2387 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1083, - "step": 2388 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.834, - "step": 2389 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8406, - "step": 2390 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1168, - "step": 2391 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2535, - "step": 2392 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9025, - "step": 2393 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4481, - "step": 2394 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7631, - "step": 2395 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2051, - "step": 2396 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7816, - "step": 2397 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2566, - "step": 2398 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1125, - "step": 2399 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5952, - "step": 2400 - }, - { - "epoch": 0.02, - "eval_loss": 6.616010665893555, - "eval_runtime": 22.4801, - "eval_samples_per_second": 2.224, - "eval_steps_per_second": 1.112, - "step": 2400 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.427501640319824, - "step": 2400 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6081, - "step": 2401 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2359, - "step": 2402 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2628, - "step": 2403 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8465, - "step": 2404 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.6746, - "step": 2405 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1066, - "step": 2406 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4981, - "step": 2407 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9194, - "step": 2408 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.239, - "step": 2409 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1749, - "step": 2410 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4597, - "step": 2411 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5164, - "step": 2412 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4122, - "step": 2413 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7398, - "step": 2414 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5009, - "step": 2415 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2757, - "step": 2416 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4202, - "step": 2417 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.108, - "step": 2418 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3303, - "step": 2419 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4671, - "step": 2420 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5768, - "step": 2421 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9172, - "step": 2422 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7606, - "step": 2423 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0745, - "step": 2424 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2907, - "step": 2425 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6956, - "step": 2426 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4414, - "step": 2427 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9823, - "step": 2428 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6486, - "step": 2429 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5173, - "step": 2430 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4647, - "step": 2431 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9405, - "step": 2432 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4226, - "step": 2433 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4334, - "step": 2434 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9136, - "step": 2435 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.6707, - "step": 2436 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6107, - "step": 2437 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5296, - "step": 2438 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0736, - "step": 2439 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4398, - "step": 2440 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5669, - "step": 2441 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.303, - "step": 2442 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2993, - "step": 2443 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9063, - "step": 2444 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3015, - "step": 2445 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3117, - "step": 2446 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6228, - "step": 2447 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6216, - "step": 2448 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6188, - "step": 2449 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8996, - "step": 2450 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5802, - "step": 2451 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2603, - "step": 2452 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0921, - "step": 2453 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9377, - "step": 2454 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0934, - "step": 2455 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9832, - "step": 2456 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1084, - "step": 2457 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2592, - "step": 2458 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8545, - "step": 2459 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4335, - "step": 2460 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5591, - "step": 2461 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.284, - "step": 2462 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8171, - "step": 2463 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8541, - "step": 2464 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1355, - "step": 2465 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6885, - "step": 2466 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.311, - "step": 2467 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.72, - "step": 2468 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.007, - "step": 2469 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2275, - "step": 2470 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.129, - "step": 2471 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9531, - "step": 2472 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7327, - "step": 2473 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5001, - "step": 2474 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9443, - "step": 2475 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6005, - "step": 2476 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5551, - "step": 2477 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3044, - "step": 2478 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6257, - "step": 2479 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5029, - "step": 2480 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3717, - "step": 2481 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5226, - "step": 2482 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2921, - "step": 2483 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7169, - "step": 2484 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2185, - "step": 2485 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5393, - "step": 2486 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0286, - "step": 2487 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3738, - "step": 2488 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2249, - "step": 2489 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7828, - "step": 2490 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.87, - "step": 2491 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.681, - "step": 2492 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5446, - "step": 2493 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0769, - "step": 2494 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3587, - "step": 2495 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9188, - "step": 2496 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9357, - "step": 2497 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3449, - "step": 2498 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2753, - "step": 2499 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3077, - "step": 2500 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0085, - "step": 2501 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5436, - "step": 2502 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9096, - "step": 2503 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7288, - "step": 2504 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7344, - "step": 2505 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6385, - "step": 2506 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6713, - "step": 2507 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6065, - "step": 2508 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3456, - "step": 2509 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1633, - "step": 2510 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5557, - "step": 2511 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7075, - "step": 2512 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4954, - "step": 2513 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5104, - "step": 2514 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5706, - "step": 2515 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7479, - "step": 2516 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7042, - "step": 2517 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9569, - "step": 2518 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7846, - "step": 2519 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.749, - "step": 2520 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5868, - "step": 2521 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3957, - "step": 2522 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2594, - "step": 2523 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.632, - "step": 2524 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.023, - "step": 2525 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0423, - "step": 2526 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1901, - "step": 2527 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0025, - "step": 2528 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0252, - "step": 2529 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8165, - "step": 2530 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6864, - "step": 2531 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.1174, - "step": 2532 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.46, - "step": 2533 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3714, - "step": 2534 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1206, - "step": 2535 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3734, - "step": 2536 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7229, - "step": 2537 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0124, - "step": 2538 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2105, - "step": 2539 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1127, - "step": 2540 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.1163, - "step": 2541 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5255, - "step": 2542 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2058, - "step": 2543 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7425, - "step": 2544 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3621, - "step": 2545 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7541, - "step": 2546 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9611, - "step": 2547 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3031, - "step": 2548 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1756, - "step": 2549 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6146, - "step": 2550 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1434, - "step": 2551 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0786, - "step": 2552 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9498, - "step": 2553 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8681, - "step": 2554 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5898, - "step": 2555 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7522, - "step": 2556 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3648, - "step": 2557 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8895, - "step": 2558 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9903, - "step": 2559 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1113, - "step": 2560 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6541, - "step": 2561 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8563, - "step": 2562 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.0685, - "step": 2563 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.59, - "step": 2564 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0927, - "step": 2565 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3792, - "step": 2566 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.208, - "step": 2567 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9275, - "step": 2568 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.25, - "step": 2569 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9524, - "step": 2570 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.556, - "step": 2571 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6414, - "step": 2572 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1749, - "step": 2573 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4529, - "step": 2574 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9764, - "step": 2575 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1835, - "step": 2576 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.438, - "step": 2577 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.834, - "step": 2578 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8879, - "step": 2579 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1302, - "step": 2580 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8966, - "step": 2581 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7319, - "step": 2582 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3287, - "step": 2583 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3322, - "step": 2584 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0278, - "step": 2585 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5355, - "step": 2586 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2016, - "step": 2587 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8335, - "step": 2588 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.721, - "step": 2589 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4628, - "step": 2590 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7896, - "step": 2591 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7226, - "step": 2592 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5834, - "step": 2593 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8944, - "step": 2594 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1519, - "step": 2595 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2173, - "step": 2596 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9751, - "step": 2597 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1408, - "step": 2598 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2469, - "step": 2599 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3136, - "step": 2600 - }, - { - "epoch": 0.02, - "eval_loss": 6.580307483673096, - "eval_runtime": 22.5866, - "eval_samples_per_second": 2.214, - "eval_steps_per_second": 1.107, - "step": 2600 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.17715097402597402, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.21428571428571427, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.0, - "mmlu_loss": 3.684196367263794, - "step": 2600 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4702, - "step": 2601 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2103, - "step": 2602 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1688, - "step": 2603 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0882, - "step": 2604 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4417, - "step": 2605 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4982, - "step": 2606 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.3721, - "step": 2607 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5558, - "step": 2608 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.251, - "step": 2609 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5517, - "step": 2610 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5841, - "step": 2611 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3849, - "step": 2612 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5556, - "step": 2613 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4158, - "step": 2614 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9386, - "step": 2615 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6376, - "step": 2616 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7976, - "step": 2617 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.291, - "step": 2618 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8779, - "step": 2619 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8159, - "step": 2620 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1198, - "step": 2621 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9783, - "step": 2622 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0635, - "step": 2623 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8539, - "step": 2624 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5904, - "step": 2625 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7561, - "step": 2626 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3628, - "step": 2627 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.2452, - "step": 2628 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8438, - "step": 2629 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7726, - "step": 2630 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.8356, - "step": 2631 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6763, - "step": 2632 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9104, - "step": 2633 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1199, - "step": 2634 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4833, - "step": 2635 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6164, - "step": 2636 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2284, - "step": 2637 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8304, - "step": 2638 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7706, - "step": 2639 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.638, - "step": 2640 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9576, - "step": 2641 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0828, - "step": 2642 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5206, - "step": 2643 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7744, - "step": 2644 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5361, - "step": 2645 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9041, - "step": 2646 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6145, - "step": 2647 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9121, - "step": 2648 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1322, - "step": 2649 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.1881, - "step": 2650 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6446, - "step": 2651 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9137, - "step": 2652 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4681, - "step": 2653 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9891, - "step": 2654 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3255, - "step": 2655 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.3909, - "step": 2656 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6869, - "step": 2657 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0611, - "step": 2658 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3314, - "step": 2659 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6415, - "step": 2660 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5881, - "step": 2661 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8889, - "step": 2662 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3336, - "step": 2663 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1282, - "step": 2664 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.158, - "step": 2665 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1773, - "step": 2666 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9617, - "step": 2667 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5576, - "step": 2668 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8284, - "step": 2669 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5936, - "step": 2670 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0931, - "step": 2671 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.07, - "step": 2672 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.778, - "step": 2673 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7786, - "step": 2674 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1279, - "step": 2675 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.463, - "step": 2676 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2192, - "step": 2677 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4147, - "step": 2678 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9957, - "step": 2679 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8919, - "step": 2680 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1558, - "step": 2681 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7069, - "step": 2682 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.487, - "step": 2683 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7098, - "step": 2684 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1032, - "step": 2685 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9937, - "step": 2686 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.0677, - "step": 2687 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.644, - "step": 2688 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5099, - "step": 2689 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6922, - "step": 2690 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7491, - "step": 2691 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.605, - "step": 2692 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1904, - "step": 2693 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9175, - "step": 2694 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3251, - "step": 2695 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.315, - "step": 2696 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3052, - "step": 2697 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2122, - "step": 2698 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9462, - "step": 2699 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3221, - "step": 2700 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3125, - "step": 2701 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.938, - "step": 2702 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0983, - "step": 2703 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8028, - "step": 2704 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4752, - "step": 2705 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.16, - "step": 2706 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2429, - "step": 2707 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.623, - "step": 2708 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9595, - "step": 2709 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5444, - "step": 2710 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.6245, - "step": 2711 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.919, - "step": 2712 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7332, - "step": 2713 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0067, - "step": 2714 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6957, - "step": 2715 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.994, - "step": 2716 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7562, - "step": 2717 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6882, - "step": 2718 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8732, - "step": 2719 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6496, - "step": 2720 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4841, - "step": 2721 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4927, - "step": 2722 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7437, - "step": 2723 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9469, - "step": 2724 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1481, - "step": 2725 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7762, - "step": 2726 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8806, - "step": 2727 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8352, - "step": 2728 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9488, - "step": 2729 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1679, - "step": 2730 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2412, - "step": 2731 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6839, - "step": 2732 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7269, - "step": 2733 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6139, - "step": 2734 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8754, - "step": 2735 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9007, - "step": 2736 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9134, - "step": 2737 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9469, - "step": 2738 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9293, - "step": 2739 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0489, - "step": 2740 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4197, - "step": 2741 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.3667, - "step": 2742 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8059, - "step": 2743 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.024, - "step": 2744 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0756, - "step": 2745 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0596, - "step": 2746 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1416, - "step": 2747 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1271, - "step": 2748 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1452, - "step": 2749 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.9527, - "step": 2750 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9189, - "step": 2751 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4364, - "step": 2752 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4173, - "step": 2753 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4034, - "step": 2754 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6678, - "step": 2755 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1092, - "step": 2756 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7751, - "step": 2757 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0164, - "step": 2758 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5796, - "step": 2759 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7851, - "step": 2760 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1784, - "step": 2761 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7417, - "step": 2762 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4268, - "step": 2763 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6919, - "step": 2764 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1838, - "step": 2765 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5592, - "step": 2766 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.425, - "step": 2767 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.848, - "step": 2768 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5683, - "step": 2769 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0262, - "step": 2770 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8063, - "step": 2771 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6139, - "step": 2772 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3478, - "step": 2773 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1811, - "step": 2774 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4519, - "step": 2775 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0071, - "step": 2776 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7872, - "step": 2777 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2263, - "step": 2778 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8923, - "step": 2779 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2997, - "step": 2780 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6857, - "step": 2781 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8874, - "step": 2782 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8203, - "step": 2783 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9583, - "step": 2784 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0814, - "step": 2785 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.419, - "step": 2786 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3063, - "step": 2787 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1285, - "step": 2788 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0039, - "step": 2789 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.671, - "step": 2790 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5452, - "step": 2791 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3885, - "step": 2792 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6282, - "step": 2793 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5913, - "step": 2794 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6189, - "step": 2795 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2968, - "step": 2796 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2825, - "step": 2797 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9897, - "step": 2798 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8193, - "step": 2799 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7042, - "step": 2800 - }, - { - "epoch": 0.02, - "eval_loss": 6.604581832885742, - "eval_runtime": 22.516, - "eval_samples_per_second": 2.221, - "eval_steps_per_second": 1.11, - "step": 2800 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 4.006761569976806, - "step": 2800 - } - ], - "max_steps": 30000, - "num_train_epochs": 1, - "total_flos": 4.660001608148582e+16, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoint-2800/training_args.bin b/checkpoint-2800/training_args.bin deleted file mode 100644 index 29a1b90871dc30211978426049e89f31e2b38f56..0000000000000000000000000000000000000000 --- a/checkpoint-2800/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2493c95326c359fb00f88976162bc7966690beaaca22964b91c1db649a04988f -size 6011 diff --git a/checkpoint-3000/README.md b/checkpoint-3000/README.md deleted file mode 100644 index 82793f73e61dbb024e11fc6697bba1622d4d0db6..0000000000000000000000000000000000000000 --- a/checkpoint-3000/README.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -library_name: peft ---- -## Training procedure - - -The following `bitsandbytes` quantization config was used during training: -- load_in_8bit: False -- load_in_4bit: True -- llm_int8_threshold: 6.0 -- llm_int8_skip_modules: None -- llm_int8_enable_fp32_cpu_offload: False -- llm_int8_has_fp16_weight: False -- bnb_4bit_quant_type: nf4 -- bnb_4bit_use_double_quant: True -- bnb_4bit_compute_dtype: bfloat16 -### Framework versions - - -- PEFT 0.4.0 diff --git a/checkpoint-3000/adapter_config.json b/checkpoint-3000/adapter_config.json deleted file mode 100644 index 2adcd7d22e9c842efe5230fdbfc7ae7a84aededb..0000000000000000000000000000000000000000 --- a/checkpoint-3000/adapter_config.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "auto_mapping": null, - "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": true, - "layers_pattern": null, - "layers_to_transform": null, - "lora_alpha": 16.0, - "lora_dropout": 0.1, - "modules_to_save": null, - "peft_type": "LORA", - "r": 64, - "revision": null, - "target_modules": [ - "q_proj", - "o_proj", - "k_proj", - "gate_proj", - "down_proj", - "v_proj", - "up_proj" - ], - "task_type": "CAUSAL_LM" -} \ No newline at end of file diff --git a/checkpoint-3000/adapter_model.bin b/checkpoint-3000/adapter_model.bin deleted file mode 100644 index a07c42932c8213b6199c8b6020b7690682ce65df..0000000000000000000000000000000000000000 --- a/checkpoint-3000/adapter_model.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8300649a3cb3257506bd84a299764cdbbadb65ebf8c06576deb99c0b813044d3 -size 871609293 diff --git a/checkpoint-3000/added_tokens.json b/checkpoint-3000/added_tokens.json deleted file mode 100644 index e41416ddd79948246ea2dced6800ea3cd531c424..0000000000000000000000000000000000000000 --- a/checkpoint-3000/added_tokens.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "[PAD]": 32000 -} diff --git a/checkpoint-3000/optimizer.pt b/checkpoint-3000/optimizer.pt deleted file mode 100644 index 7119b9ca6c227869b6474a455d51eca73d20a640..0000000000000000000000000000000000000000 --- a/checkpoint-3000/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:528ae8d418d6bd9f8defca665070c7a898fc988f536cf557953fbc1e2798def6 -size 873872799 diff --git a/checkpoint-3000/rng_state.pth b/checkpoint-3000/rng_state.pth deleted file mode 100644 index 2b5f562ec2b39a00e12fe386e08ac946a222473c..0000000000000000000000000000000000000000 --- a/checkpoint-3000/rng_state.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:236c5f2a4bdbbbac5308c611e909c39eb694692e85b45c57d4e25a5aed8d3b27 -size 14511 diff --git a/checkpoint-3000/special_tokens_map.json b/checkpoint-3000/special_tokens_map.json deleted file mode 100644 index 3f58a5e115855c6ea3cec98accae196ad927222e..0000000000000000000000000000000000000000 --- a/checkpoint-3000/special_tokens_map.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "bos_token": "", - "eos_token": "", - "pad_token": "[PAD]", - "unk_token": "" -} diff --git a/checkpoint-3000/tokenizer.model b/checkpoint-3000/tokenizer.model deleted file mode 100644 index 6c00c742ce03c627d6cd5b795984876fa49fa899..0000000000000000000000000000000000000000 --- a/checkpoint-3000/tokenizer.model +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 -size 499723 diff --git a/checkpoint-3000/tokenizer_config.json b/checkpoint-3000/tokenizer_config.json deleted file mode 100644 index daaef2433dab9469de98b5b9a3848221ab25b7e8..0000000000000000000000000000000000000000 --- a/checkpoint-3000/tokenizer_config.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "bos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - }, - "clean_up_tokenization_spaces": false, - "eos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - }, - "legacy": null, - "model_max_length": 1000000000000000019884624838656, - "pad_token": null, - "padding_side": "right", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizer", - "unk_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - } -} diff --git a/checkpoint-3000/trainer_state.json b/checkpoint-3000/trainer_state.json deleted file mode 100644 index c68adf599ac1d2d3971d2a6c3ade162b6a7879e0..0000000000000000000000000000000000000000 --- a/checkpoint-3000/trainer_state.json +++ /dev/null @@ -1,18295 +0,0 @@ -{ - "best_metric": 6.580160140991211, - "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-2200", - "epoch": 0.022916507524253303, - "global_step": 3000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0808, - "step": 1 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8773, - "step": 2 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1965, - "step": 3 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.118, - "step": 4 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1773, - "step": 5 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1165, - "step": 6 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2666, - "step": 7 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3704, - "step": 8 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9976, - "step": 9 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.985, - "step": 10 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.0541, - "step": 11 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.6228, - "step": 12 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.3651, - "step": 13 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0867, - "step": 14 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.4422, - "step": 15 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.7759, - "step": 16 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1446, - "step": 17 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0007, - "step": 18 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0894, - "step": 19 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2424, - "step": 20 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.1343, - "step": 21 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5354, - "step": 22 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1887, - "step": 23 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.6652, - "step": 24 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.964, - "step": 25 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1872, - "step": 26 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.4722, - "step": 27 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1462, - "step": 28 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0485, - "step": 29 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.148, - "step": 30 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7274, - "step": 31 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.6689, - "step": 32 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3384, - "step": 33 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.5354, - "step": 34 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.1976, - "step": 35 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.8593, - "step": 36 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.9302, - "step": 37 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5968, - "step": 38 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3169, - "step": 39 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.1793, - "step": 40 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.8457, - "step": 41 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5177, - "step": 42 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.003, - "step": 43 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.9928, - "step": 44 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.2574, - "step": 45 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3915, - "step": 46 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4105, - "step": 47 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.1184, - "step": 48 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.72, - "step": 49 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9628, - "step": 50 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2372, - "step": 51 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3733, - "step": 52 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8936, - "step": 53 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5353, - "step": 54 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.0754, - "step": 55 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6685, - "step": 56 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8984, - "step": 57 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2265, - "step": 58 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7696, - "step": 59 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7349, - "step": 60 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0221, - "step": 61 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.1901, - "step": 62 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.387, - "step": 63 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7323, - "step": 64 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2077, - "step": 65 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.3155, - "step": 66 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1656, - "step": 67 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 13.0828, - "step": 68 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5295, - "step": 69 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4575, - "step": 70 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.7654, - "step": 71 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.6263, - "step": 72 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 24.8238, - "step": 73 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.0654, - "step": 74 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 28.1046, - "step": 75 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.3232, - "step": 76 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 22.9712, - "step": 77 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 18.8529, - "step": 78 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.8356, - "step": 79 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 16.472, - "step": 80 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.2369, - "step": 81 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.0731, - "step": 82 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.8853, - "step": 83 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5438, - "step": 84 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2665, - "step": 85 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5484, - "step": 86 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7546, - "step": 87 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4309, - "step": 88 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5593, - "step": 89 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3822, - "step": 90 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.6315, - "step": 91 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6116, - "step": 92 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2288, - "step": 93 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0053, - "step": 94 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.359, - "step": 95 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9235, - "step": 96 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 31.9845, - "step": 97 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.1385, - "step": 98 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6161, - "step": 99 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8096, - "step": 100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9918, - "step": 101 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.344, - "step": 102 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1607, - "step": 103 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4834, - "step": 104 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.704, - "step": 105 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1238, - "step": 106 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8066, - "step": 107 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9656, - "step": 108 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1979, - "step": 109 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2294, - "step": 110 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.066, - "step": 111 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7914, - "step": 112 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7344, - "step": 113 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6703, - "step": 114 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8817, - "step": 115 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.7733, - "step": 116 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.469, - "step": 117 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1304, - "step": 118 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.871, - "step": 119 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5353, - "step": 120 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9055, - "step": 121 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6142, - "step": 122 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0201, - "step": 123 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3805, - "step": 124 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6825, - "step": 125 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7166, - "step": 126 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7747, - "step": 127 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7695, - "step": 128 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7291, - "step": 129 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1296, - "step": 130 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5374, - "step": 131 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1854, - "step": 132 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.434, - "step": 133 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.438, - "step": 134 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3027, - "step": 135 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.382, - "step": 136 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9277, - "step": 137 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.223, - "step": 138 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3042, - "step": 139 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6361, - "step": 140 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3547, - "step": 141 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.7181, - "step": 142 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.7528, - "step": 143 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.4316, - "step": 144 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2219, - "step": 145 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7788, - "step": 146 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2749, - "step": 147 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2397, - "step": 148 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6243, - "step": 149 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.145, - "step": 150 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7951, - "step": 151 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1862, - "step": 152 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1305, - "step": 153 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5766, - "step": 154 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9232, - "step": 155 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9936, - "step": 156 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.9692, - "step": 157 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2772, - "step": 158 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.302, - "step": 159 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9931, - "step": 160 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9675, - "step": 161 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8536, - "step": 162 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6589, - "step": 163 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.932, - "step": 164 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0301, - "step": 165 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4861, - "step": 166 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1354, - "step": 167 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0717, - "step": 168 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9346, - "step": 169 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9373, - "step": 170 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8777, - "step": 171 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4193, - "step": 172 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6831, - "step": 173 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4175, - "step": 174 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3629, - "step": 175 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.118, - "step": 176 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.633, - "step": 177 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8355, - "step": 178 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4522, - "step": 179 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9272, - "step": 180 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4631, - "step": 181 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2987, - "step": 182 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1183, - "step": 183 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9976, - "step": 184 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0668, - "step": 185 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6291, - "step": 186 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5937, - "step": 187 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7382, - "step": 188 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7677, - "step": 189 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0293, - "step": 190 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6407, - "step": 191 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9508, - "step": 192 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.5053, - "step": 193 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5718, - "step": 194 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5211, - "step": 195 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9557, - "step": 196 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1609, - "step": 197 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8505, - "step": 198 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8278, - "step": 199 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8447, - "step": 200 - }, - { - "epoch": 0.0, - "eval_loss": 7.883856773376465, - "eval_runtime": 22.4254, - "eval_samples_per_second": 2.23, - "eval_steps_per_second": 1.115, - "step": 200 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 4.629522514343262, - "step": 200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3249, - "step": 201 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.352, - "step": 202 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2984, - "step": 203 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.2734, - "step": 204 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1, - "step": 205 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.448, - "step": 206 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2387, - "step": 207 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.861, - "step": 208 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.603, - "step": 209 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.29, - "step": 210 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2105, - "step": 211 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1949, - "step": 212 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0538, - "step": 213 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0343, - "step": 214 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7794, - "step": 215 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5532, - "step": 216 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2676, - "step": 217 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.566, - "step": 218 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0432, - "step": 219 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9391, - "step": 220 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.724, - "step": 221 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.229, - "step": 222 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3462, - "step": 223 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0752, - "step": 224 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1966, - "step": 225 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7279, - "step": 226 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8484, - "step": 227 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7291, - "step": 228 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2665, - "step": 229 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3551, - "step": 230 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7338, - "step": 231 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8407, - "step": 232 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3581, - "step": 233 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.441, - "step": 234 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0788, - "step": 235 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8404, - "step": 236 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4314, - "step": 237 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8426, - "step": 238 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.0205, - "step": 239 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4162, - "step": 240 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7515, - "step": 241 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1442, - "step": 242 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5868, - "step": 243 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6514, - "step": 244 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2683, - "step": 245 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.31, - "step": 246 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0161, - "step": 247 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.484, - "step": 248 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9726, - "step": 249 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0926, - "step": 250 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5279, - "step": 251 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0017, - "step": 252 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5684, - "step": 253 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3875, - "step": 254 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9489, - "step": 255 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.8948, - "step": 256 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0856, - "step": 257 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.599, - "step": 258 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1575, - "step": 259 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3701, - "step": 260 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.464, - "step": 261 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9193, - "step": 262 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5679, - "step": 263 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9424, - "step": 264 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6689, - "step": 265 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6475, - "step": 266 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4311, - "step": 267 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7426, - "step": 268 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5191, - "step": 269 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3059, - "step": 270 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0142, - "step": 271 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.4509, - "step": 272 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0831, - "step": 273 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6977, - "step": 274 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4236, - "step": 275 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2129, - "step": 276 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1394, - "step": 277 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.685, - "step": 278 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0275, - "step": 279 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.3215, - "step": 280 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6542, - "step": 281 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7614, - "step": 282 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2996, - "step": 283 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6275, - "step": 284 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8736, - "step": 285 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4667, - "step": 286 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8486, - "step": 287 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2125, - "step": 288 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4523, - "step": 289 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.551, - "step": 290 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.7158, - "step": 291 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5092, - "step": 292 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9169, - "step": 293 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5333, - "step": 294 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9949, - "step": 295 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.7189, - "step": 296 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2366, - "step": 297 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4745, - "step": 298 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2439, - "step": 299 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4176, - "step": 300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.9365, - "step": 301 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5309, - "step": 302 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2201, - "step": 303 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0312, - "step": 304 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4173, - "step": 305 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4856, - "step": 306 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5041, - "step": 307 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3597, - "step": 308 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8395, - "step": 309 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0776, - "step": 310 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7566, - "step": 311 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9767, - "step": 312 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3804, - "step": 313 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5327, - "step": 314 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5293, - "step": 315 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4531, - "step": 316 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3961, - "step": 317 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5669, - "step": 318 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8559, - "step": 319 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.117, - "step": 320 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4279, - "step": 321 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7977, - "step": 322 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.955, - "step": 323 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0164, - "step": 324 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.0495, - "step": 325 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2768, - "step": 326 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3162, - "step": 327 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.88, - "step": 328 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2157, - "step": 329 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8427, - "step": 330 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9729, - "step": 331 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1779, - "step": 332 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1302, - "step": 333 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7705, - "step": 334 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.523, - "step": 335 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9375, - "step": 336 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.1409, - "step": 337 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.633, - "step": 338 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6481, - "step": 339 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.933, - "step": 340 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9179, - "step": 341 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9332, - "step": 342 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6553, - "step": 343 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7412, - "step": 344 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.849, - "step": 345 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.7321, - "step": 346 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9717, - "step": 347 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3465, - "step": 348 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4535, - "step": 349 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2376, - "step": 350 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9025, - "step": 351 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.916, - "step": 352 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.3785, - "step": 353 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0576, - "step": 354 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5081, - "step": 355 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1303, - "step": 356 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3854, - "step": 357 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5553, - "step": 358 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9627, - "step": 359 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.402, - "step": 360 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3484, - "step": 361 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5428, - "step": 362 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9128, - "step": 363 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3934, - "step": 364 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4812, - "step": 365 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5395, - "step": 366 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6304, - "step": 367 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5626, - "step": 368 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5693, - "step": 369 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3458, - "step": 370 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6254, - "step": 371 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8706, - "step": 372 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6076, - "step": 373 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.2912, - "step": 374 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3326, - "step": 375 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3735, - "step": 376 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4916, - "step": 377 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5553, - "step": 378 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6241, - "step": 379 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6106, - "step": 380 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.266, - "step": 381 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7738, - "step": 382 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4988, - "step": 383 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2968, - "step": 384 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8512, - "step": 385 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0341, - "step": 386 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.898, - "step": 387 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.23, - "step": 388 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9608, - "step": 389 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.3679, - "step": 390 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.7074, - "step": 391 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9903, - "step": 392 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5845, - "step": 393 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6493, - "step": 394 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7962, - "step": 395 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4865, - "step": 396 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3418, - "step": 397 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3942, - "step": 398 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4715, - "step": 399 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2073, - "step": 400 - }, - { - "epoch": 0.0, - "eval_loss": 7.106412410736084, - "eval_runtime": 22.5667, - "eval_samples_per_second": 2.216, - "eval_steps_per_second": 1.108, - "step": 400 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 2.9128687667846678, - "step": 400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3984, - "step": 401 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7983, - "step": 402 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8589, - "step": 403 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9884, - "step": 404 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4427, - "step": 405 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0374, - "step": 406 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7999, - "step": 407 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2437, - "step": 408 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6902, - "step": 409 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.81, - "step": 410 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8979, - "step": 411 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0211, - "step": 412 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3945, - "step": 413 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5807, - "step": 414 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1433, - "step": 415 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9466, - "step": 416 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6276, - "step": 417 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4945, - "step": 418 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6215, - "step": 419 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.3919, - "step": 420 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7915, - "step": 421 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3284, - "step": 422 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8723, - "step": 423 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0149, - "step": 424 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.979, - "step": 425 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9175, - "step": 426 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4994, - "step": 427 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9791, - "step": 428 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1156, - "step": 429 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5813, - "step": 430 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1882, - "step": 431 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9956, - "step": 432 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6189, - "step": 433 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9624, - "step": 434 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5387, - "step": 435 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4605, - "step": 436 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.474, - "step": 437 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0497, - "step": 438 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5705, - "step": 439 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.275, - "step": 440 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9638, - "step": 441 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4857, - "step": 442 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3067, - "step": 443 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8152, - "step": 444 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1668, - "step": 445 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5293, - "step": 446 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3981, - "step": 447 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4787, - "step": 448 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5981, - "step": 449 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3569, - "step": 450 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4088, - "step": 451 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3677, - "step": 452 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4686, - "step": 453 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3552, - "step": 454 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7931, - "step": 455 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9285, - "step": 456 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0554, - "step": 457 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7277, - "step": 458 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2474, - "step": 459 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9274, - "step": 460 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2558, - "step": 461 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7547, - "step": 462 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1264, - "step": 463 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2124, - "step": 464 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8751, - "step": 465 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7317, - "step": 466 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3697, - "step": 467 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0021, - "step": 468 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3761, - "step": 469 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2291, - "step": 470 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7968, - "step": 471 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9454, - "step": 472 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0194, - "step": 473 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5048, - "step": 474 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6837, - "step": 475 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1066, - "step": 476 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3501, - "step": 477 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5071, - "step": 478 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1086, - "step": 479 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7269, - "step": 480 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5419, - "step": 481 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2974, - "step": 482 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.1433, - "step": 483 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0869, - "step": 484 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.032, - "step": 485 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0946, - "step": 486 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7162, - "step": 487 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0406, - "step": 488 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9048, - "step": 489 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2231, - "step": 490 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.6524, - "step": 491 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1151, - "step": 492 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.591, - "step": 493 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1628, - "step": 494 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0757, - "step": 495 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3471, - "step": 496 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9385, - "step": 497 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9362, - "step": 498 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2252, - "step": 499 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.359, - "step": 500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0497, - "step": 501 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0484, - "step": 502 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5773, - "step": 503 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.39, - "step": 504 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5923, - "step": 505 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2, - "step": 506 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5536, - "step": 507 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.8958, - "step": 508 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7763, - "step": 509 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2045, - "step": 510 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4219, - "step": 511 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6305, - "step": 512 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4243, - "step": 513 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7842, - "step": 514 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8769, - "step": 515 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8903, - "step": 516 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0489, - "step": 517 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1314, - "step": 518 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5973, - "step": 519 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8022, - "step": 520 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3539, - "step": 521 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.222, - "step": 522 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5403, - "step": 523 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1323, - "step": 524 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7813, - "step": 525 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4982, - "step": 526 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2426, - "step": 527 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0142, - "step": 528 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8996, - "step": 529 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8671, - "step": 530 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4139, - "step": 531 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9478, - "step": 532 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7062, - "step": 533 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0098, - "step": 534 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9195, - "step": 535 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0255, - "step": 536 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6291, - "step": 537 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3245, - "step": 538 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6382, - "step": 539 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.8076, - "step": 540 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6725, - "step": 541 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0563, - "step": 542 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.6178, - "step": 543 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7974, - "step": 544 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7535, - "step": 545 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4948, - "step": 546 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8941, - "step": 547 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6496, - "step": 548 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9084, - "step": 549 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.65, - "step": 550 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7673, - "step": 551 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2221, - "step": 552 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.14, - "step": 553 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.6747, - "step": 554 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8009, - "step": 555 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7307, - "step": 556 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0143, - "step": 557 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8098, - "step": 558 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.026, - "step": 559 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4572, - "step": 560 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7913, - "step": 561 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9962, - "step": 562 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.767, - "step": 563 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9497, - "step": 564 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9626, - "step": 565 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2536, - "step": 566 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0421, - "step": 567 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8177, - "step": 568 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9241, - "step": 569 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0162, - "step": 570 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3368, - "step": 571 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7515, - "step": 572 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6389, - "step": 573 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.662, - "step": 574 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8097, - "step": 575 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9346, - "step": 576 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3154, - "step": 577 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7724, - "step": 578 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3685, - "step": 579 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.2775, - "step": 580 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.106, - "step": 581 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4733, - "step": 582 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2334, - "step": 583 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9478, - "step": 584 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0013, - "step": 585 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7242, - "step": 586 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.922, - "step": 587 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1418, - "step": 588 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4472, - "step": 589 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4785, - "step": 590 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.783, - "step": 591 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0706, - "step": 592 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4136, - "step": 593 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5969, - "step": 594 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5157, - "step": 595 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5658, - "step": 596 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4647, - "step": 597 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2028, - "step": 598 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6913, - "step": 599 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7239, - "step": 600 - }, - { - "epoch": 0.0, - "eval_loss": 7.012163162231445, - "eval_runtime": 22.5807, - "eval_samples_per_second": 2.214, - "eval_steps_per_second": 1.107, - "step": 600 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.24488224029541, - "step": 600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5253, - "step": 601 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0392, - "step": 602 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.447, - "step": 603 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9441, - "step": 604 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1874, - "step": 605 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7817, - "step": 606 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0348, - "step": 607 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5593, - "step": 608 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9361, - "step": 609 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3534, - "step": 610 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.476, - "step": 611 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0937, - "step": 612 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3027, - "step": 613 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5586, - "step": 614 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3796, - "step": 615 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.676, - "step": 616 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5321, - "step": 617 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0059, - "step": 618 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6139, - "step": 619 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.2391, - "step": 620 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0636, - "step": 621 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0895, - "step": 622 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.62, - "step": 623 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0469, - "step": 624 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2173, - "step": 625 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9432, - "step": 626 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3928, - "step": 627 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0959, - "step": 628 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.1197, - "step": 629 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4277, - "step": 630 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.418, - "step": 631 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8687, - "step": 632 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0156, - "step": 633 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.573, - "step": 634 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.112, - "step": 635 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8954, - "step": 636 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.36, - "step": 637 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.924, - "step": 638 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4625, - "step": 639 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2023, - "step": 640 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0685, - "step": 641 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5304, - "step": 642 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4456, - "step": 643 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7271, - "step": 644 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6011, - "step": 645 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.895, - "step": 646 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.864, - "step": 647 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3452, - "step": 648 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8978, - "step": 649 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2253, - "step": 650 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2813, - "step": 651 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7248, - "step": 652 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4283, - "step": 653 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4304, - "step": 654 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3893, - "step": 655 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1115, - "step": 656 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5892, - "step": 657 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6572, - "step": 658 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.925, - "step": 659 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4431, - "step": 660 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7711, - "step": 661 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9439, - "step": 662 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3781, - "step": 663 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5573, - "step": 664 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.4476, - "step": 665 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0057, - "step": 666 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2702, - "step": 667 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5717, - "step": 668 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2242, - "step": 669 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1, - "step": 670 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0517, - "step": 671 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6543, - "step": 672 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1138, - "step": 673 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.461, - "step": 674 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7094, - "step": 675 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.521, - "step": 676 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7116, - "step": 677 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6343, - "step": 678 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3762, - "step": 679 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3603, - "step": 680 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7144, - "step": 681 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4545, - "step": 682 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8188, - "step": 683 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7965, - "step": 684 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4675, - "step": 685 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0436, - "step": 686 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1219, - "step": 687 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4517, - "step": 688 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8476, - "step": 689 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9284, - "step": 690 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7405, - "step": 691 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7142, - "step": 692 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3979, - "step": 693 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.3285, - "step": 694 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3418, - "step": 695 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4472, - "step": 696 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7355, - "step": 697 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7982, - "step": 698 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4516, - "step": 699 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2532, - "step": 700 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9959, - "step": 701 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0418, - "step": 702 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.7767, - "step": 703 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.774, - "step": 704 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8912, - "step": 705 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2178, - "step": 706 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6197, - "step": 707 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4755, - "step": 708 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8276, - "step": 709 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2925, - "step": 710 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3887, - "step": 711 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1465, - "step": 712 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5806, - "step": 713 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3063, - "step": 714 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6066, - "step": 715 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1536, - "step": 716 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5582, - "step": 717 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0353, - "step": 718 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6415, - "step": 719 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8291, - "step": 720 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.7575, - "step": 721 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9141, - "step": 722 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5217, - "step": 723 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4549, - "step": 724 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8112, - "step": 725 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2729, - "step": 726 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8515, - "step": 727 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9712, - "step": 728 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.097, - "step": 729 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0208, - "step": 730 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1377, - "step": 731 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4019, - "step": 732 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9869, - "step": 733 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2954, - "step": 734 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4144, - "step": 735 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8053, - "step": 736 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8891, - "step": 737 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.812, - "step": 738 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2657, - "step": 739 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3747, - "step": 740 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0364, - "step": 741 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8845, - "step": 742 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.887, - "step": 743 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0706, - "step": 744 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6619, - "step": 745 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2941, - "step": 746 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9192, - "step": 747 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9947, - "step": 748 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6376, - "step": 749 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0358, - "step": 750 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4578, - "step": 751 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7784, - "step": 752 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.632, - "step": 753 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8649, - "step": 754 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7951, - "step": 755 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3841, - "step": 756 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4558, - "step": 757 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7638, - "step": 758 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9413, - "step": 759 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0916, - "step": 760 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1351, - "step": 761 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6078, - "step": 762 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7982, - "step": 763 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6132, - "step": 764 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.551, - "step": 765 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3301, - "step": 766 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4888, - "step": 767 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1476, - "step": 768 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4244, - "step": 769 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6025, - "step": 770 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.102, - "step": 771 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.017, - "step": 772 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4101, - "step": 773 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1741, - "step": 774 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1256, - "step": 775 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5164, - "step": 776 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6959, - "step": 777 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7666, - "step": 778 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4336, - "step": 779 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 11.8478, - "step": 780 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8382, - "step": 781 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1792, - "step": 782 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4424, - "step": 783 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.345, - "step": 784 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6887, - "step": 785 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9867, - "step": 786 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6152, - "step": 787 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7283, - "step": 788 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0157, - "step": 789 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6044, - "step": 790 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4132, - "step": 791 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.735, - "step": 792 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3631, - "step": 793 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2308, - "step": 794 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2184, - "step": 795 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4661, - "step": 796 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9637, - "step": 797 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4178, - "step": 798 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5909, - "step": 799 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1482, - "step": 800 - }, - { - "epoch": 0.01, - "eval_loss": 7.355834484100342, - "eval_runtime": 22.6252, - "eval_samples_per_second": 2.21, - "eval_steps_per_second": 1.105, - "step": 800 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 5.191131496429444, - "step": 800 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.0427, - "step": 801 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2669, - "step": 802 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8026, - "step": 803 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4949, - "step": 804 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4491, - "step": 805 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0383, - "step": 806 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1213, - "step": 807 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5158, - "step": 808 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5648, - "step": 809 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9932, - "step": 810 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6441, - "step": 811 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8661, - "step": 812 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3609, - "step": 813 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6828, - "step": 814 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9693, - "step": 815 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3733, - "step": 816 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6286, - "step": 817 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4349, - "step": 818 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6706, - "step": 819 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3089, - "step": 820 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2394, - "step": 821 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.963, - "step": 822 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6564, - "step": 823 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.997, - "step": 824 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9261, - "step": 825 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1421, - "step": 826 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2335, - "step": 827 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3432, - "step": 828 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0154, - "step": 829 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5135, - "step": 830 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6226, - "step": 831 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1672, - "step": 832 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0853, - "step": 833 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1213, - "step": 834 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7815, - "step": 835 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8916, - "step": 836 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6464, - "step": 837 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3307, - "step": 838 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8165, - "step": 839 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.886, - "step": 840 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4781, - "step": 841 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8351, - "step": 842 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.358, - "step": 843 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6501, - "step": 844 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0864, - "step": 845 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2922, - "step": 846 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.9847, - "step": 847 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2558, - "step": 848 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0195, - "step": 849 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.996, - "step": 850 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5705, - "step": 851 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4136, - "step": 852 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6302, - "step": 853 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8761, - "step": 854 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4995, - "step": 855 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4762, - "step": 856 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5749, - "step": 857 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0273, - "step": 858 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8258, - "step": 859 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1836, - "step": 860 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5493, - "step": 861 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1891, - "step": 862 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7392, - "step": 863 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1655, - "step": 864 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5218, - "step": 865 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3759, - "step": 866 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2497, - "step": 867 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5901, - "step": 868 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0624, - "step": 869 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.2452, - "step": 870 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5649, - "step": 871 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0826, - "step": 872 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2703, - "step": 873 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9088, - "step": 874 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3875, - "step": 875 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2511, - "step": 876 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4065, - "step": 877 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.175, - "step": 878 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8358, - "step": 879 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3208, - "step": 880 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2049, - "step": 881 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8251, - "step": 882 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4262, - "step": 883 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2227, - "step": 884 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1062, - "step": 885 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9417, - "step": 886 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3315, - "step": 887 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0012, - "step": 888 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6386, - "step": 889 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0377, - "step": 890 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6707, - "step": 891 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4955, - "step": 892 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7343, - "step": 893 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8305, - "step": 894 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7016, - "step": 895 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7149, - "step": 896 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5649, - "step": 897 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.815, - "step": 898 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6135, - "step": 899 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8776, - "step": 900 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7288, - "step": 901 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8019, - "step": 902 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0921, - "step": 903 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.824, - "step": 904 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7151, - "step": 905 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5054, - "step": 906 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8095, - "step": 907 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3218, - "step": 908 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9993, - "step": 909 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4433, - "step": 910 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5863, - "step": 911 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.505, - "step": 912 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9734, - "step": 913 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1792, - "step": 914 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4574, - "step": 915 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2787, - "step": 916 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8201, - "step": 917 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2537, - "step": 918 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1387, - "step": 919 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7161, - "step": 920 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2207, - "step": 921 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7953, - "step": 922 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9949, - "step": 923 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9173, - "step": 924 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7903, - "step": 925 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4784, - "step": 926 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2264, - "step": 927 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.566, - "step": 928 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0686, - "step": 929 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.791, - "step": 930 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8393, - "step": 931 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4387, - "step": 932 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2374, - "step": 933 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9598, - "step": 934 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1597, - "step": 935 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0403, - "step": 936 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3301, - "step": 937 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.072, - "step": 938 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4788, - "step": 939 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0656, - "step": 940 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9647, - "step": 941 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1168, - "step": 942 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0293, - "step": 943 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3622, - "step": 944 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8957, - "step": 945 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4, - "step": 946 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6626, - "step": 947 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8212, - "step": 948 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8638, - "step": 949 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6406, - "step": 950 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7069, - "step": 951 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1384, - "step": 952 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.612, - "step": 953 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7201, - "step": 954 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3532, - "step": 955 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1266, - "step": 956 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6192, - "step": 957 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.826, - "step": 958 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9338, - "step": 959 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4487, - "step": 960 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.872, - "step": 961 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8601, - "step": 962 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7401, - "step": 963 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5412, - "step": 964 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2501, - "step": 965 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6837, - "step": 966 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6494, - "step": 967 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.604, - "step": 968 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.837, - "step": 969 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3957, - "step": 970 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3281, - "step": 971 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8264, - "step": 972 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6554, - "step": 973 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5768, - "step": 974 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4187, - "step": 975 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8479, - "step": 976 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9849, - "step": 977 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6471, - "step": 978 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8041, - "step": 979 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8876, - "step": 980 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6423, - "step": 981 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5329, - "step": 982 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2801, - "step": 983 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1699, - "step": 984 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6469, - "step": 985 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6766, - "step": 986 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7538, - "step": 987 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9606, - "step": 988 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0713, - "step": 989 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4965, - "step": 990 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3408, - "step": 991 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4007, - "step": 992 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8921, - "step": 993 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8681, - "step": 994 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.8867, - "step": 995 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.467, - "step": 996 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7895, - "step": 997 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0523, - "step": 998 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4032, - "step": 999 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7719, - "step": 1000 - }, - { - "epoch": 0.01, - "eval_loss": 6.766034126281738, - "eval_runtime": 22.4042, - "eval_samples_per_second": 2.232, - "eval_steps_per_second": 1.116, - "step": 1000 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 4.338861379623413, - "step": 1000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0285, - "step": 1001 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4571, - "step": 1002 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7721, - "step": 1003 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5108, - "step": 1004 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3813, - "step": 1005 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7963, - "step": 1006 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1101, - "step": 1007 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.021, - "step": 1008 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5916, - "step": 1009 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8813, - "step": 1010 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1193, - "step": 1011 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5092, - "step": 1012 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8569, - "step": 1013 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.119, - "step": 1014 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3247, - "step": 1015 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2358, - "step": 1016 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2795, - "step": 1017 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3466, - "step": 1018 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5443, - "step": 1019 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7296, - "step": 1020 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0412, - "step": 1021 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4829, - "step": 1022 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7901, - "step": 1023 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8077, - "step": 1024 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4887, - "step": 1025 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3095, - "step": 1026 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3235, - "step": 1027 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6315, - "step": 1028 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4294, - "step": 1029 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8457, - "step": 1030 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7583, - "step": 1031 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3129, - "step": 1032 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1832, - "step": 1033 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1764, - "step": 1034 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0101, - "step": 1035 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6524, - "step": 1036 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2825, - "step": 1037 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2262, - "step": 1038 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2533, - "step": 1039 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8794, - "step": 1040 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7901, - "step": 1041 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8351, - "step": 1042 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5888, - "step": 1043 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8932, - "step": 1044 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2999, - "step": 1045 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8396, - "step": 1046 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4209, - "step": 1047 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1524, - "step": 1048 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7784, - "step": 1049 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0179, - "step": 1050 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1153, - "step": 1051 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2149, - "step": 1052 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0117, - "step": 1053 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9693, - "step": 1054 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5656, - "step": 1055 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5, - "step": 1056 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.102, - "step": 1057 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3079, - "step": 1058 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5754, - "step": 1059 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6989, - "step": 1060 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9597, - "step": 1061 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3743, - "step": 1062 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8887, - "step": 1063 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3779, - "step": 1064 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5001, - "step": 1065 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4095, - "step": 1066 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5073, - "step": 1067 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1331, - "step": 1068 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.323, - "step": 1069 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6116, - "step": 1070 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1212, - "step": 1071 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0951, - "step": 1072 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2463, - "step": 1073 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4488, - "step": 1074 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.279, - "step": 1075 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5728, - "step": 1076 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1362, - "step": 1077 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6648, - "step": 1078 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.427, - "step": 1079 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8145, - "step": 1080 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5308, - "step": 1081 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.974, - "step": 1082 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1965, - "step": 1083 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8749, - "step": 1084 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7352, - "step": 1085 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7934, - "step": 1086 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6003, - "step": 1087 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5775, - "step": 1088 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.519, - "step": 1089 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7403, - "step": 1090 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8145, - "step": 1091 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5776, - "step": 1092 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3753, - "step": 1093 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9586, - "step": 1094 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7263, - "step": 1095 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7034, - "step": 1096 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0579, - "step": 1097 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8419, - "step": 1098 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0751, - "step": 1099 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6438, - "step": 1100 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8744, - "step": 1101 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4992, - "step": 1102 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8094, - "step": 1103 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.162, - "step": 1104 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8351, - "step": 1105 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8845, - "step": 1106 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1894, - "step": 1107 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.8333, - "step": 1108 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4226, - "step": 1109 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0712, - "step": 1110 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9981, - "step": 1111 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5885, - "step": 1112 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1915, - "step": 1113 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8003, - "step": 1114 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5566, - "step": 1115 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4085, - "step": 1116 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0793, - "step": 1117 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0909, - "step": 1118 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2273, - "step": 1119 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8273, - "step": 1120 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0231, - "step": 1121 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7229, - "step": 1122 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4479, - "step": 1123 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2178, - "step": 1124 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9038, - "step": 1125 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2653, - "step": 1126 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2974, - "step": 1127 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3003, - "step": 1128 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7853, - "step": 1129 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9143, - "step": 1130 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2573, - "step": 1131 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7091, - "step": 1132 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3372, - "step": 1133 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4165, - "step": 1134 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4422, - "step": 1135 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7693, - "step": 1136 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7802, - "step": 1137 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7263, - "step": 1138 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6749, - "step": 1139 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9459, - "step": 1140 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9697, - "step": 1141 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4506, - "step": 1142 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5099, - "step": 1143 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1475, - "step": 1144 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3769, - "step": 1145 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2035, - "step": 1146 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6017, - "step": 1147 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.463, - "step": 1148 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3844, - "step": 1149 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5306, - "step": 1150 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5502, - "step": 1151 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7394, - "step": 1152 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5626, - "step": 1153 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1618, - "step": 1154 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5174, - "step": 1155 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1038, - "step": 1156 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3789, - "step": 1157 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2985, - "step": 1158 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4763, - "step": 1159 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5071, - "step": 1160 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0827, - "step": 1161 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7349, - "step": 1162 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.798, - "step": 1163 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3176, - "step": 1164 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8114, - "step": 1165 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3379, - "step": 1166 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1157, - "step": 1167 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4675, - "step": 1168 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2721, - "step": 1169 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0603, - "step": 1170 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6358, - "step": 1171 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0865, - "step": 1172 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.709, - "step": 1173 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7705, - "step": 1174 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7677, - "step": 1175 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2418, - "step": 1176 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7114, - "step": 1177 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1165, - "step": 1178 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9654, - "step": 1179 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0672, - "step": 1180 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1738, - "step": 1181 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7604, - "step": 1182 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8426, - "step": 1183 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0231, - "step": 1184 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2938, - "step": 1185 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.783, - "step": 1186 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3328, - "step": 1187 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.321, - "step": 1188 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6368, - "step": 1189 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.101, - "step": 1190 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6777, - "step": 1191 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0831, - "step": 1192 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5853, - "step": 1193 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7923, - "step": 1194 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3734, - "step": 1195 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4268, - "step": 1196 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6796, - "step": 1197 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9028, - "step": 1198 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3716, - "step": 1199 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6761, - "step": 1200 - }, - { - "epoch": 0.01, - "eval_loss": 6.9188361167907715, - "eval_runtime": 22.426, - "eval_samples_per_second": 2.23, - "eval_steps_per_second": 1.115, - "step": 1200 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 3.3686839294433595, - "step": 1200 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8855, - "step": 1201 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8206, - "step": 1202 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4401, - "step": 1203 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2366, - "step": 1204 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9885, - "step": 1205 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5444, - "step": 1206 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4259, - "step": 1207 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5369, - "step": 1208 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0839, - "step": 1209 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7622, - "step": 1210 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8979, - "step": 1211 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5508, - "step": 1212 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6439, - "step": 1213 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6249, - "step": 1214 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.495, - "step": 1215 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0642, - "step": 1216 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8997, - "step": 1217 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6275, - "step": 1218 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3317, - "step": 1219 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4635, - "step": 1220 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5197, - "step": 1221 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5928, - "step": 1222 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2363, - "step": 1223 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0266, - "step": 1224 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3356, - "step": 1225 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7927, - "step": 1226 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6952, - "step": 1227 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8878, - "step": 1228 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7472, - "step": 1229 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6454, - "step": 1230 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4972, - "step": 1231 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3347, - "step": 1232 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1631, - "step": 1233 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4708, - "step": 1234 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5697, - "step": 1235 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8218, - "step": 1236 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.269, - "step": 1237 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4165, - "step": 1238 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3653, - "step": 1239 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0152, - "step": 1240 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9157, - "step": 1241 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4086, - "step": 1242 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2493, - "step": 1243 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8279, - "step": 1244 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6649, - "step": 1245 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4405, - "step": 1246 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1992, - "step": 1247 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2055, - "step": 1248 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4395, - "step": 1249 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2475, - "step": 1250 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8205, - "step": 1251 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1053, - "step": 1252 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7494, - "step": 1253 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7387, - "step": 1254 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8983, - "step": 1255 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5614, - "step": 1256 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7617, - "step": 1257 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2445, - "step": 1258 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3043, - "step": 1259 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4214, - "step": 1260 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1384, - "step": 1261 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3914, - "step": 1262 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3287, - "step": 1263 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2174, - "step": 1264 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4397, - "step": 1265 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6875, - "step": 1266 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4512, - "step": 1267 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2834, - "step": 1268 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7651, - "step": 1269 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9263, - "step": 1270 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6721, - "step": 1271 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9178, - "step": 1272 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7967, - "step": 1273 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5242, - "step": 1274 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7794, - "step": 1275 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4256, - "step": 1276 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5788, - "step": 1277 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7586, - "step": 1278 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.964, - "step": 1279 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0749, - "step": 1280 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6248, - "step": 1281 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2465, - "step": 1282 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1591, - "step": 1283 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4328, - "step": 1284 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.534, - "step": 1285 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.523, - "step": 1286 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5672, - "step": 1287 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9162, - "step": 1288 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1089, - "step": 1289 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3287, - "step": 1290 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2499, - "step": 1291 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9645, - "step": 1292 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3903, - "step": 1293 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5322, - "step": 1294 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2211, - "step": 1295 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2788, - "step": 1296 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1862, - "step": 1297 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2678, - "step": 1298 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5399, - "step": 1299 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7935, - "step": 1300 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0391, - "step": 1301 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1049, - "step": 1302 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.365, - "step": 1303 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8809, - "step": 1304 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2335, - "step": 1305 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.5135, - "step": 1306 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2378, - "step": 1307 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9265, - "step": 1308 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.641, - "step": 1309 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9822, - "step": 1310 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3369, - "step": 1311 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3735, - "step": 1312 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2618, - "step": 1313 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6854, - "step": 1314 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3748, - "step": 1315 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9206, - "step": 1316 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1969, - "step": 1317 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1245, - "step": 1318 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9977, - "step": 1319 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5319, - "step": 1320 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4431, - "step": 1321 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7264, - "step": 1322 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.05, - "step": 1323 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3118, - "step": 1324 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4575, - "step": 1325 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.593, - "step": 1326 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0061, - "step": 1327 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2481, - "step": 1328 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8017, - "step": 1329 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8617, - "step": 1330 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7036, - "step": 1331 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0091, - "step": 1332 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9687, - "step": 1333 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3925, - "step": 1334 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1127, - "step": 1335 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8163, - "step": 1336 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0639, - "step": 1337 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8929, - "step": 1338 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5011, - "step": 1339 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.033, - "step": 1340 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0526, - "step": 1341 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4428, - "step": 1342 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3477, - "step": 1343 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.881, - "step": 1344 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5276, - "step": 1345 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4183, - "step": 1346 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4943, - "step": 1347 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9187, - "step": 1348 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1003, - "step": 1349 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1187, - "step": 1350 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8081, - "step": 1351 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4695, - "step": 1352 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5761, - "step": 1353 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9635, - "step": 1354 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2133, - "step": 1355 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2611, - "step": 1356 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6885, - "step": 1357 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1157, - "step": 1358 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4421, - "step": 1359 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2128, - "step": 1360 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6978, - "step": 1361 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9804, - "step": 1362 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3426, - "step": 1363 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2676, - "step": 1364 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.325, - "step": 1365 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1263, - "step": 1366 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7481, - "step": 1367 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6891, - "step": 1368 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8568, - "step": 1369 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9893, - "step": 1370 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0302, - "step": 1371 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3204, - "step": 1372 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9008, - "step": 1373 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2624, - "step": 1374 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6234, - "step": 1375 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2286, - "step": 1376 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3426, - "step": 1377 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1962, - "step": 1378 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3142, - "step": 1379 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.414, - "step": 1380 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0191, - "step": 1381 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4953, - "step": 1382 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6694, - "step": 1383 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8611, - "step": 1384 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.86, - "step": 1385 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6519, - "step": 1386 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.394, - "step": 1387 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2117, - "step": 1388 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9924, - "step": 1389 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.58, - "step": 1390 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4415, - "step": 1391 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7196, - "step": 1392 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7388, - "step": 1393 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4784, - "step": 1394 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.496, - "step": 1395 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8706, - "step": 1396 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1858, - "step": 1397 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9038, - "step": 1398 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4852, - "step": 1399 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2717, - "step": 1400 - }, - { - "epoch": 0.01, - "eval_loss": 6.97923469543457, - "eval_runtime": 22.472, - "eval_samples_per_second": 2.225, - "eval_steps_per_second": 1.112, - "step": 1400 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.657382688522339, - "step": 1400 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.843, - "step": 1401 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5611, - "step": 1402 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2434, - "step": 1403 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3136, - "step": 1404 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.686, - "step": 1405 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6365, - "step": 1406 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1811, - "step": 1407 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7537, - "step": 1408 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2949, - "step": 1409 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4827, - "step": 1410 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0965, - "step": 1411 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.007, - "step": 1412 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2861, - "step": 1413 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1774, - "step": 1414 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7777, - "step": 1415 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0259, - "step": 1416 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9024, - "step": 1417 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4786, - "step": 1418 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5873, - "step": 1419 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2744, - "step": 1420 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9484, - "step": 1421 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2093, - "step": 1422 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3394, - "step": 1423 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1247, - "step": 1424 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0691, - "step": 1425 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.559, - "step": 1426 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1518, - "step": 1427 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4143, - "step": 1428 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0287, - "step": 1429 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8112, - "step": 1430 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2625, - "step": 1431 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3528, - "step": 1432 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2715, - "step": 1433 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7849, - "step": 1434 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2002, - "step": 1435 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0658, - "step": 1436 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0671, - "step": 1437 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2577, - "step": 1438 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.803, - "step": 1439 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2974, - "step": 1440 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0897, - "step": 1441 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0805, - "step": 1442 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7681, - "step": 1443 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6565, - "step": 1444 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0174, - "step": 1445 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8507, - "step": 1446 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2105, - "step": 1447 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.572, - "step": 1448 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2904, - "step": 1449 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4623, - "step": 1450 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4774, - "step": 1451 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1277, - "step": 1452 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6204, - "step": 1453 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3219, - "step": 1454 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2517, - "step": 1455 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3026, - "step": 1456 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4016, - "step": 1457 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5256, - "step": 1458 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9316, - "step": 1459 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.631, - "step": 1460 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2888, - "step": 1461 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5511, - "step": 1462 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.9799, - "step": 1463 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6982, - "step": 1464 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4923, - "step": 1465 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8329, - "step": 1466 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2733, - "step": 1467 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8221, - "step": 1468 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.363, - "step": 1469 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6348, - "step": 1470 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3319, - "step": 1471 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6768, - "step": 1472 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1985, - "step": 1473 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6109, - "step": 1474 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.974, - "step": 1475 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8902, - "step": 1476 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6762, - "step": 1477 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8541, - "step": 1478 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3867, - "step": 1479 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9624, - "step": 1480 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8768, - "step": 1481 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7598, - "step": 1482 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6522, - "step": 1483 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8156, - "step": 1484 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3791, - "step": 1485 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2178, - "step": 1486 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8448, - "step": 1487 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5377, - "step": 1488 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7407, - "step": 1489 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7636, - "step": 1490 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4325, - "step": 1491 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8966, - "step": 1492 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0626, - "step": 1493 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.255, - "step": 1494 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2802, - "step": 1495 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.894, - "step": 1496 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6482, - "step": 1497 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8903, - "step": 1498 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8672, - "step": 1499 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6079, - "step": 1500 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6217, - "step": 1501 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2361, - "step": 1502 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3962, - "step": 1503 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0036, - "step": 1504 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5926, - "step": 1505 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.114, - "step": 1506 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4419, - "step": 1507 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7838, - "step": 1508 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6635, - "step": 1509 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2906, - "step": 1510 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4186, - "step": 1511 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4783, - "step": 1512 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1226, - "step": 1513 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2458, - "step": 1514 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5302, - "step": 1515 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1515, - "step": 1516 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4182, - "step": 1517 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8248, - "step": 1518 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2349, - "step": 1519 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9314, - "step": 1520 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1161, - "step": 1521 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4183, - "step": 1522 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4454, - "step": 1523 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5588, - "step": 1524 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8026, - "step": 1525 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7695, - "step": 1526 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3636, - "step": 1527 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2776, - "step": 1528 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5386, - "step": 1529 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.521, - "step": 1530 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8388, - "step": 1531 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3561, - "step": 1532 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9606, - "step": 1533 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9438, - "step": 1534 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7665, - "step": 1535 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5826, - "step": 1536 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.0798, - "step": 1537 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8545, - "step": 1538 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.302, - "step": 1539 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1092, - "step": 1540 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5021, - "step": 1541 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9384, - "step": 1542 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8761, - "step": 1543 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3316, - "step": 1544 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2051, - "step": 1545 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7907, - "step": 1546 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2534, - "step": 1547 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2274, - "step": 1548 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9226, - "step": 1549 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2502, - "step": 1550 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2703, - "step": 1551 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4359, - "step": 1552 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.128, - "step": 1553 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3147, - "step": 1554 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.026, - "step": 1555 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9393, - "step": 1556 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7753, - "step": 1557 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9049, - "step": 1558 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0538, - "step": 1559 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8691, - "step": 1560 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9377, - "step": 1561 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8385, - "step": 1562 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.939, - "step": 1563 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.727, - "step": 1564 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7866, - "step": 1565 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2439, - "step": 1566 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9607, - "step": 1567 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3505, - "step": 1568 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7729, - "step": 1569 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4669, - "step": 1570 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8178, - "step": 1571 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2173, - "step": 1572 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2136, - "step": 1573 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2888, - "step": 1574 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0386, - "step": 1575 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9041, - "step": 1576 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7544, - "step": 1577 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.3229, - "step": 1578 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4203, - "step": 1579 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.497, - "step": 1580 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8253, - "step": 1581 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0801, - "step": 1582 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1585, - "step": 1583 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6965, - "step": 1584 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.498, - "step": 1585 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8697, - "step": 1586 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2663, - "step": 1587 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7004, - "step": 1588 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6561, - "step": 1589 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.785, - "step": 1590 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5708, - "step": 1591 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.326, - "step": 1592 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2974, - "step": 1593 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1408, - "step": 1594 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6526, - "step": 1595 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4116, - "step": 1596 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0484, - "step": 1597 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3162, - "step": 1598 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3806, - "step": 1599 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0251, - "step": 1600 - }, - { - "epoch": 0.01, - "eval_loss": 6.617897987365723, - "eval_runtime": 22.4646, - "eval_samples_per_second": 2.226, - "eval_steps_per_second": 1.113, - "step": 1600 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.160770101547241, - "step": 1600 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9137, - "step": 1601 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2188, - "step": 1602 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7688, - "step": 1603 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9799, - "step": 1604 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5429, - "step": 1605 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8559, - "step": 1606 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3985, - "step": 1607 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9139, - "step": 1608 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3303, - "step": 1609 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5168, - "step": 1610 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5194, - "step": 1611 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9557, - "step": 1612 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7102, - "step": 1613 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8961, - "step": 1614 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6123, - "step": 1615 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7808, - "step": 1616 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4919, - "step": 1617 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0723, - "step": 1618 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2931, - "step": 1619 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8478, - "step": 1620 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7126, - "step": 1621 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6622, - "step": 1622 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3861, - "step": 1623 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9856, - "step": 1624 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5076, - "step": 1625 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4168, - "step": 1626 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2825, - "step": 1627 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7497, - "step": 1628 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5672, - "step": 1629 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4095, - "step": 1630 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.649, - "step": 1631 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3418, - "step": 1632 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1337, - "step": 1633 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3829, - "step": 1634 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0294, - "step": 1635 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2164, - "step": 1636 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3294, - "step": 1637 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7047, - "step": 1638 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5622, - "step": 1639 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4873, - "step": 1640 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6641, - "step": 1641 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3943, - "step": 1642 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2266, - "step": 1643 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0471, - "step": 1644 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5658, - "step": 1645 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6489, - "step": 1646 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3851, - "step": 1647 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7921, - "step": 1648 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4581, - "step": 1649 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1407, - "step": 1650 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2919, - "step": 1651 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4061, - "step": 1652 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3081, - "step": 1653 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0527, - "step": 1654 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8729, - "step": 1655 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.029, - "step": 1656 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6632, - "step": 1657 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7047, - "step": 1658 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6288, - "step": 1659 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8466, - "step": 1660 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7726, - "step": 1661 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.245, - "step": 1662 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0538, - "step": 1663 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3611, - "step": 1664 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.011, - "step": 1665 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6491, - "step": 1666 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3409, - "step": 1667 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.262, - "step": 1668 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.781, - "step": 1669 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8025, - "step": 1670 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7529, - "step": 1671 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2322, - "step": 1672 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4527, - "step": 1673 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9457, - "step": 1674 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.859, - "step": 1675 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9207, - "step": 1676 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5378, - "step": 1677 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6585, - "step": 1678 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9523, - "step": 1679 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1348, - "step": 1680 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9582, - "step": 1681 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.416, - "step": 1682 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8214, - "step": 1683 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8833, - "step": 1684 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1021, - "step": 1685 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7392, - "step": 1686 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2616, - "step": 1687 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.325, - "step": 1688 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3814, - "step": 1689 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2816, - "step": 1690 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.033, - "step": 1691 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5742, - "step": 1692 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0841, - "step": 1693 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2888, - "step": 1694 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9263, - "step": 1695 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7552, - "step": 1696 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4163, - "step": 1697 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6207, - "step": 1698 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.938, - "step": 1699 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2925, - "step": 1700 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0401, - "step": 1701 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1536, - "step": 1702 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2754, - "step": 1703 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6765, - "step": 1704 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.63, - "step": 1705 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6902, - "step": 1706 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6434, - "step": 1707 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2283, - "step": 1708 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9377, - "step": 1709 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.371, - "step": 1710 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6569, - "step": 1711 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2221, - "step": 1712 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5375, - "step": 1713 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2189, - "step": 1714 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.769, - "step": 1715 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0089, - "step": 1716 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6402, - "step": 1717 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4812, - "step": 1718 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9754, - "step": 1719 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8435, - "step": 1720 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9424, - "step": 1721 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5465, - "step": 1722 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.477, - "step": 1723 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2254, - "step": 1724 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3663, - "step": 1725 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.663, - "step": 1726 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6951, - "step": 1727 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.856, - "step": 1728 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0652, - "step": 1729 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6929, - "step": 1730 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8069, - "step": 1731 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.02, - "step": 1732 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0782, - "step": 1733 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0236, - "step": 1734 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2769, - "step": 1735 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7126, - "step": 1736 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2746, - "step": 1737 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8726, - "step": 1738 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7962, - "step": 1739 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7602, - "step": 1740 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.3105, - "step": 1741 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0771, - "step": 1742 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4738, - "step": 1743 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2952, - "step": 1744 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2692, - "step": 1745 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7201, - "step": 1746 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2978, - "step": 1747 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.518, - "step": 1748 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.659, - "step": 1749 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9101, - "step": 1750 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8397, - "step": 1751 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0451, - "step": 1752 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7696, - "step": 1753 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1377, - "step": 1754 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2621, - "step": 1755 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2143, - "step": 1756 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4378, - "step": 1757 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8631, - "step": 1758 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.019, - "step": 1759 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7475, - "step": 1760 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6228, - "step": 1761 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0703, - "step": 1762 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3426, - "step": 1763 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0842, - "step": 1764 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1032, - "step": 1765 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6321, - "step": 1766 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7804, - "step": 1767 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6566, - "step": 1768 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4985, - "step": 1769 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1129, - "step": 1770 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8081, - "step": 1771 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8441, - "step": 1772 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4054, - "step": 1773 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6334, - "step": 1774 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4323, - "step": 1775 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.18, - "step": 1776 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7573, - "step": 1777 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4642, - "step": 1778 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.038, - "step": 1779 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3785, - "step": 1780 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5088, - "step": 1781 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0139, - "step": 1782 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0999, - "step": 1783 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3224, - "step": 1784 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.634, - "step": 1785 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1264, - "step": 1786 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.317, - "step": 1787 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1279, - "step": 1788 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2364, - "step": 1789 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0627, - "step": 1790 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2471, - "step": 1791 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8407, - "step": 1792 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7083, - "step": 1793 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4522, - "step": 1794 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0308, - "step": 1795 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6915, - "step": 1796 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.082, - "step": 1797 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7956, - "step": 1798 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7007, - "step": 1799 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9197, - "step": 1800 - }, - { - "epoch": 0.01, - "eval_loss": 6.619495868682861, - "eval_runtime": 22.4352, - "eval_samples_per_second": 2.229, - "eval_steps_per_second": 1.114, - "step": 1800 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.238778591156006, - "step": 1800 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1537, - "step": 1801 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.684, - "step": 1802 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7862, - "step": 1803 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3518, - "step": 1804 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1795, - "step": 1805 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0054, - "step": 1806 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8997, - "step": 1807 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9002, - "step": 1808 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2805, - "step": 1809 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1203, - "step": 1810 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0206, - "step": 1811 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0151, - "step": 1812 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3864, - "step": 1813 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1117, - "step": 1814 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8487, - "step": 1815 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.59, - "step": 1816 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1615, - "step": 1817 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7362, - "step": 1818 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2294, - "step": 1819 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5622, - "step": 1820 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5437, - "step": 1821 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.093, - "step": 1822 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0343, - "step": 1823 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4454, - "step": 1824 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5138, - "step": 1825 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5605, - "step": 1826 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.322, - "step": 1827 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6489, - "step": 1828 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.331, - "step": 1829 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6462, - "step": 1830 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.932, - "step": 1831 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9058, - "step": 1832 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3433, - "step": 1833 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4365, - "step": 1834 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3282, - "step": 1835 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.448, - "step": 1836 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5369, - "step": 1837 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.177, - "step": 1838 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3552, - "step": 1839 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4568, - "step": 1840 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0602, - "step": 1841 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7449, - "step": 1842 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2675, - "step": 1843 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0317, - "step": 1844 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4342, - "step": 1845 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8688, - "step": 1846 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3571, - "step": 1847 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3776, - "step": 1848 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2248, - "step": 1849 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6073, - "step": 1850 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8425, - "step": 1851 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5954, - "step": 1852 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4197, - "step": 1853 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8624, - "step": 1854 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9652, - "step": 1855 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7145, - "step": 1856 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5309, - "step": 1857 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4356, - "step": 1858 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6508, - "step": 1859 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0955, - "step": 1860 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6886, - "step": 1861 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7644, - "step": 1862 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5709, - "step": 1863 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6212, - "step": 1864 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6325, - "step": 1865 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6805, - "step": 1866 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1464, - "step": 1867 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9244, - "step": 1868 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.336, - "step": 1869 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8783, - "step": 1870 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8236, - "step": 1871 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.084, - "step": 1872 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9639, - "step": 1873 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4173, - "step": 1874 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0042, - "step": 1875 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2519, - "step": 1876 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4656, - "step": 1877 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5136, - "step": 1878 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3918, - "step": 1879 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9696, - "step": 1880 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9736, - "step": 1881 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6192, - "step": 1882 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3476, - "step": 1883 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3048, - "step": 1884 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1116, - "step": 1885 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.971, - "step": 1886 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0741, - "step": 1887 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1418, - "step": 1888 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3487, - "step": 1889 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.38, - "step": 1890 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6561, - "step": 1891 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5606, - "step": 1892 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8623, - "step": 1893 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2984, - "step": 1894 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6179, - "step": 1895 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8625, - "step": 1896 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8596, - "step": 1897 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7205, - "step": 1898 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6727, - "step": 1899 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.016, - "step": 1900 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9868, - "step": 1901 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.632, - "step": 1902 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5133, - "step": 1903 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7476, - "step": 1904 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4174, - "step": 1905 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6789, - "step": 1906 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4534, - "step": 1907 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3335, - "step": 1908 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7921, - "step": 1909 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9567, - "step": 1910 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.1739, - "step": 1911 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7514, - "step": 1912 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3858, - "step": 1913 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0462, - "step": 1914 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3817, - "step": 1915 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9739, - "step": 1916 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1122, - "step": 1917 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3361, - "step": 1918 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3184, - "step": 1919 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7342, - "step": 1920 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.375, - "step": 1921 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6841, - "step": 1922 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0773, - "step": 1923 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8916, - "step": 1924 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7176, - "step": 1925 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8841, - "step": 1926 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8345, - "step": 1927 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.561, - "step": 1928 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5392, - "step": 1929 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1627, - "step": 1930 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0657, - "step": 1931 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7385, - "step": 1932 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5533, - "step": 1933 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0925, - "step": 1934 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8752, - "step": 1935 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4039, - "step": 1936 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6472, - "step": 1937 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1819, - "step": 1938 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5919, - "step": 1939 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6527, - "step": 1940 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5188, - "step": 1941 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9856, - "step": 1942 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7038, - "step": 1943 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.911, - "step": 1944 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.497, - "step": 1945 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1804, - "step": 1946 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3949, - "step": 1947 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0433, - "step": 1948 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4706, - "step": 1949 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5896, - "step": 1950 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.557, - "step": 1951 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.34, - "step": 1952 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7865, - "step": 1953 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0797, - "step": 1954 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2896, - "step": 1955 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4096, - "step": 1956 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9538, - "step": 1957 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2778, - "step": 1958 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4968, - "step": 1959 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8328, - "step": 1960 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4597, - "step": 1961 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6776, - "step": 1962 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4861, - "step": 1963 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5831, - "step": 1964 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4585, - "step": 1965 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7898, - "step": 1966 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8714, - "step": 1967 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.752, - "step": 1968 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9024, - "step": 1969 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.058, - "step": 1970 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1745, - "step": 1971 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2162, - "step": 1972 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2668, - "step": 1973 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3307, - "step": 1974 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3285, - "step": 1975 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1353, - "step": 1976 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8069, - "step": 1977 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6885, - "step": 1978 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5946, - "step": 1979 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6828, - "step": 1980 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6516, - "step": 1981 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.261, - "step": 1982 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.524, - "step": 1983 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.585, - "step": 1984 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8883, - "step": 1985 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.689, - "step": 1986 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1083, - "step": 1987 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1606, - "step": 1988 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9243, - "step": 1989 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6597, - "step": 1990 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2849, - "step": 1991 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3715, - "step": 1992 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7262, - "step": 1993 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6862, - "step": 1994 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5412, - "step": 1995 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7483, - "step": 1996 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3391, - "step": 1997 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2642, - "step": 1998 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1519, - "step": 1999 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7098, - "step": 2000 - }, - { - "epoch": 0.02, - "eval_loss": 6.762476921081543, - "eval_runtime": 22.4899, - "eval_samples_per_second": 2.223, - "eval_steps_per_second": 1.112, - "step": 2000 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.4606559085845947, - "step": 2000 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8099, - "step": 2001 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0567, - "step": 2002 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2981, - "step": 2003 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2668, - "step": 2004 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.139, - "step": 2005 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.903, - "step": 2006 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2182, - "step": 2007 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2347, - "step": 2008 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8383, - "step": 2009 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0211, - "step": 2010 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2572, - "step": 2011 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2877, - "step": 2012 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3577, - "step": 2013 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2022, - "step": 2014 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2722, - "step": 2015 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0552, - "step": 2016 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.9857, - "step": 2017 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0519, - "step": 2018 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7118, - "step": 2019 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4465, - "step": 2020 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3009, - "step": 2021 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3614, - "step": 2022 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3493, - "step": 2023 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.34, - "step": 2024 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0416, - "step": 2025 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.686, - "step": 2026 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6021, - "step": 2027 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4161, - "step": 2028 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.0029, - "step": 2029 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.8579, - "step": 2030 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0247, - "step": 2031 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4184, - "step": 2032 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4962, - "step": 2033 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5137, - "step": 2034 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.6692, - "step": 2035 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7161, - "step": 2036 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.617, - "step": 2037 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.413, - "step": 2038 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3056, - "step": 2039 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9441, - "step": 2040 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9943, - "step": 2041 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5703, - "step": 2042 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1881, - "step": 2043 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5763, - "step": 2044 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6389, - "step": 2045 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1717, - "step": 2046 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5482, - "step": 2047 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9469, - "step": 2048 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7685, - "step": 2049 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1381, - "step": 2050 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6961, - "step": 2051 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6425, - "step": 2052 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5354, - "step": 2053 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2404, - "step": 2054 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1556, - "step": 2055 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7133, - "step": 2056 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8166, - "step": 2057 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5071, - "step": 2058 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5429, - "step": 2059 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0367, - "step": 2060 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5386, - "step": 2061 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.5899, - "step": 2062 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2968, - "step": 2063 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9951, - "step": 2064 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8608, - "step": 2065 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4735, - "step": 2066 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5612, - "step": 2067 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7461, - "step": 2068 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5887, - "step": 2069 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3426, - "step": 2070 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5589, - "step": 2071 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.498, - "step": 2072 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1306, - "step": 2073 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.3492, - "step": 2074 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2311, - "step": 2075 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8798, - "step": 2076 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6799, - "step": 2077 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5011, - "step": 2078 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8892, - "step": 2079 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6449, - "step": 2080 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9117, - "step": 2081 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1157, - "step": 2082 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.196, - "step": 2083 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.9364, - "step": 2084 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3618, - "step": 2085 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3755, - "step": 2086 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4564, - "step": 2087 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4912, - "step": 2088 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.113, - "step": 2089 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0588, - "step": 2090 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.668, - "step": 2091 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.08, - "step": 2092 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2042, - "step": 2093 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4134, - "step": 2094 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0456, - "step": 2095 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2245, - "step": 2096 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4936, - "step": 2097 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5158, - "step": 2098 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7269, - "step": 2099 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7077, - "step": 2100 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6008, - "step": 2101 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4652, - "step": 2102 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.918, - "step": 2103 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.5819, - "step": 2104 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7764, - "step": 2105 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.0525, - "step": 2106 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5359, - "step": 2107 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4925, - "step": 2108 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4857, - "step": 2109 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.9445, - "step": 2110 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8494, - "step": 2111 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1513, - "step": 2112 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2552, - "step": 2113 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7229, - "step": 2114 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8571, - "step": 2115 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5968, - "step": 2116 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8806, - "step": 2117 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4641, - "step": 2118 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6039, - "step": 2119 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1379, - "step": 2120 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6688, - "step": 2121 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.293, - "step": 2122 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5664, - "step": 2123 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0825, - "step": 2124 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9788, - "step": 2125 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.9641, - "step": 2126 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7799, - "step": 2127 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0619, - "step": 2128 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0022, - "step": 2129 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8022, - "step": 2130 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5301, - "step": 2131 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.681, - "step": 2132 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7362, - "step": 2133 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5462, - "step": 2134 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2356, - "step": 2135 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2259, - "step": 2136 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3646, - "step": 2137 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8765, - "step": 2138 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6487, - "step": 2139 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9622, - "step": 2140 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1761, - "step": 2141 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6922, - "step": 2142 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0371, - "step": 2143 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7869, - "step": 2144 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3725, - "step": 2145 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8894, - "step": 2146 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6083, - "step": 2147 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4451, - "step": 2148 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1149, - "step": 2149 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8058, - "step": 2150 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1308, - "step": 2151 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1447, - "step": 2152 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.208, - "step": 2153 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5193, - "step": 2154 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7729, - "step": 2155 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5019, - "step": 2156 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6092, - "step": 2157 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1853, - "step": 2158 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7, - "step": 2159 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1638, - "step": 2160 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.762, - "step": 2161 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7455, - "step": 2162 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9372, - "step": 2163 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4569, - "step": 2164 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6705, - "step": 2165 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1988, - "step": 2166 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2526, - "step": 2167 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9066, - "step": 2168 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1365, - "step": 2169 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3422, - "step": 2170 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2691, - "step": 2171 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9008, - "step": 2172 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2555, - "step": 2173 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0886, - "step": 2174 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0369, - "step": 2175 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5566, - "step": 2176 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2567, - "step": 2177 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0179, - "step": 2178 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5383, - "step": 2179 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4797, - "step": 2180 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0163, - "step": 2181 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2658, - "step": 2182 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1337, - "step": 2183 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3287, - "step": 2184 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7874, - "step": 2185 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7153, - "step": 2186 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7037, - "step": 2187 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4412, - "step": 2188 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3912, - "step": 2189 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.034, - "step": 2190 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4697, - "step": 2191 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6243, - "step": 2192 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1133, - "step": 2193 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.9005, - "step": 2194 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7386, - "step": 2195 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4169, - "step": 2196 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8139, - "step": 2197 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3012, - "step": 2198 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8223, - "step": 2199 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3757, - "step": 2200 - }, - { - "epoch": 0.02, - "eval_loss": 6.580160140991211, - "eval_runtime": 22.4971, - "eval_samples_per_second": 2.223, - "eval_steps_per_second": 1.111, - "step": 2200 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.755114164352417, - "step": 2200 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5282, - "step": 2201 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2478, - "step": 2202 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.916, - "step": 2203 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5069, - "step": 2204 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5952, - "step": 2205 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5059, - "step": 2206 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7434, - "step": 2207 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.625, - "step": 2208 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1674, - "step": 2209 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3937, - "step": 2210 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8783, - "step": 2211 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5263, - "step": 2212 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7887, - "step": 2213 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8911, - "step": 2214 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7211, - "step": 2215 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.089, - "step": 2216 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6373, - "step": 2217 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7728, - "step": 2218 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6957, - "step": 2219 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.43, - "step": 2220 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9673, - "step": 2221 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8942, - "step": 2222 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2893, - "step": 2223 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1505, - "step": 2224 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3702, - "step": 2225 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1731, - "step": 2226 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.997, - "step": 2227 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9531, - "step": 2228 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0748, - "step": 2229 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0642, - "step": 2230 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9469, - "step": 2231 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2265, - "step": 2232 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6461, - "step": 2233 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.064, - "step": 2234 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1414, - "step": 2235 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5375, - "step": 2236 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6348, - "step": 2237 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9975, - "step": 2238 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5242, - "step": 2239 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3179, - "step": 2240 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6054, - "step": 2241 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1832, - "step": 2242 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.0572, - "step": 2243 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2049, - "step": 2244 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6348, - "step": 2245 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.67, - "step": 2246 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.5627, - "step": 2247 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1851, - "step": 2248 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6792, - "step": 2249 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6344, - "step": 2250 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7603, - "step": 2251 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7737, - "step": 2252 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5323, - "step": 2253 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4059, - "step": 2254 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9343, - "step": 2255 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0156, - "step": 2256 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1851, - "step": 2257 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.44, - "step": 2258 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9079, - "step": 2259 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4982, - "step": 2260 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 12.3777, - "step": 2261 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.1265, - "step": 2262 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1428, - "step": 2263 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8072, - "step": 2264 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.911, - "step": 2265 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9453, - "step": 2266 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0168, - "step": 2267 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2098, - "step": 2268 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4417, - "step": 2269 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8449, - "step": 2270 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.394, - "step": 2271 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7642, - "step": 2272 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5555, - "step": 2273 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3576, - "step": 2274 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.386, - "step": 2275 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6677, - "step": 2276 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2385, - "step": 2277 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8074, - "step": 2278 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2963, - "step": 2279 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3612, - "step": 2280 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1837, - "step": 2281 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5882, - "step": 2282 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0968, - "step": 2283 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2376, - "step": 2284 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3835, - "step": 2285 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0143, - "step": 2286 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.36, - "step": 2287 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0121, - "step": 2288 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0144, - "step": 2289 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6807, - "step": 2290 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8854, - "step": 2291 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1727, - "step": 2292 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.533, - "step": 2293 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9793, - "step": 2294 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.538, - "step": 2295 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.145, - "step": 2296 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.163, - "step": 2297 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1045, - "step": 2298 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0209, - "step": 2299 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9728, - "step": 2300 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8902, - "step": 2301 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3075, - "step": 2302 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.2194, - "step": 2303 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7375, - "step": 2304 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3863, - "step": 2305 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1317, - "step": 2306 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1878, - "step": 2307 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6124, - "step": 2308 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8843, - "step": 2309 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3988, - "step": 2310 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3523, - "step": 2311 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5766, - "step": 2312 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9096, - "step": 2313 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9315, - "step": 2314 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4044, - "step": 2315 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6206, - "step": 2316 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2429, - "step": 2317 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0383, - "step": 2318 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4282, - "step": 2319 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8973, - "step": 2320 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1771, - "step": 2321 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.624, - "step": 2322 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5197, - "step": 2323 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7313, - "step": 2324 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8234, - "step": 2325 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1702, - "step": 2326 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.536, - "step": 2327 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1904, - "step": 2328 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2077, - "step": 2329 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.891, - "step": 2330 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6784, - "step": 2331 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6611, - "step": 2332 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3402, - "step": 2333 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 11.1523, - "step": 2334 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5547, - "step": 2335 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3485, - "step": 2336 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8289, - "step": 2337 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2558, - "step": 2338 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1794, - "step": 2339 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8782, - "step": 2340 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.415, - "step": 2341 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5257, - "step": 2342 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4751, - "step": 2343 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2259, - "step": 2344 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8681, - "step": 2345 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6307, - "step": 2346 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1487, - "step": 2347 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.3949, - "step": 2348 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6988, - "step": 2349 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1299, - "step": 2350 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9938, - "step": 2351 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4176, - "step": 2352 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0184, - "step": 2353 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2779, - "step": 2354 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0162, - "step": 2355 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2335, - "step": 2356 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5505, - "step": 2357 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6445, - "step": 2358 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6513, - "step": 2359 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8503, - "step": 2360 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1817, - "step": 2361 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4376, - "step": 2362 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1351, - "step": 2363 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7566, - "step": 2364 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.626, - "step": 2365 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5818, - "step": 2366 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3033, - "step": 2367 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9289, - "step": 2368 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0301, - "step": 2369 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4713, - "step": 2370 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0931, - "step": 2371 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5812, - "step": 2372 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2272, - "step": 2373 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5174, - "step": 2374 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1849, - "step": 2375 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7496, - "step": 2376 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.776, - "step": 2377 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3555, - "step": 2378 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.688, - "step": 2379 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0143, - "step": 2380 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7788, - "step": 2381 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7772, - "step": 2382 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6875, - "step": 2383 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9944, - "step": 2384 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8363, - "step": 2385 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7276, - "step": 2386 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4892, - "step": 2387 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1083, - "step": 2388 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.834, - "step": 2389 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8406, - "step": 2390 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1168, - "step": 2391 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2535, - "step": 2392 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9025, - "step": 2393 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4481, - "step": 2394 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7631, - "step": 2395 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2051, - "step": 2396 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7816, - "step": 2397 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2566, - "step": 2398 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1125, - "step": 2399 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5952, - "step": 2400 - }, - { - "epoch": 0.02, - "eval_loss": 6.616010665893555, - "eval_runtime": 22.4801, - "eval_samples_per_second": 2.224, - "eval_steps_per_second": 1.112, - "step": 2400 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.427501640319824, - "step": 2400 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6081, - "step": 2401 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2359, - "step": 2402 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2628, - "step": 2403 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8465, - "step": 2404 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.6746, - "step": 2405 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1066, - "step": 2406 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4981, - "step": 2407 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9194, - "step": 2408 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.239, - "step": 2409 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1749, - "step": 2410 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4597, - "step": 2411 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5164, - "step": 2412 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4122, - "step": 2413 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7398, - "step": 2414 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5009, - "step": 2415 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2757, - "step": 2416 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4202, - "step": 2417 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.108, - "step": 2418 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3303, - "step": 2419 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4671, - "step": 2420 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5768, - "step": 2421 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9172, - "step": 2422 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7606, - "step": 2423 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0745, - "step": 2424 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2907, - "step": 2425 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6956, - "step": 2426 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4414, - "step": 2427 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9823, - "step": 2428 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6486, - "step": 2429 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5173, - "step": 2430 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4647, - "step": 2431 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9405, - "step": 2432 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4226, - "step": 2433 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4334, - "step": 2434 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9136, - "step": 2435 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.6707, - "step": 2436 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6107, - "step": 2437 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5296, - "step": 2438 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0736, - "step": 2439 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4398, - "step": 2440 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5669, - "step": 2441 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.303, - "step": 2442 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2993, - "step": 2443 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9063, - "step": 2444 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3015, - "step": 2445 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3117, - "step": 2446 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6228, - "step": 2447 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6216, - "step": 2448 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6188, - "step": 2449 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8996, - "step": 2450 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5802, - "step": 2451 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2603, - "step": 2452 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0921, - "step": 2453 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9377, - "step": 2454 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0934, - "step": 2455 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9832, - "step": 2456 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1084, - "step": 2457 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2592, - "step": 2458 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8545, - "step": 2459 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4335, - "step": 2460 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5591, - "step": 2461 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.284, - "step": 2462 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8171, - "step": 2463 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8541, - "step": 2464 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1355, - "step": 2465 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6885, - "step": 2466 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.311, - "step": 2467 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.72, - "step": 2468 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.007, - "step": 2469 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2275, - "step": 2470 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.129, - "step": 2471 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9531, - "step": 2472 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7327, - "step": 2473 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5001, - "step": 2474 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9443, - "step": 2475 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6005, - "step": 2476 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5551, - "step": 2477 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3044, - "step": 2478 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6257, - "step": 2479 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5029, - "step": 2480 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3717, - "step": 2481 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5226, - "step": 2482 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2921, - "step": 2483 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7169, - "step": 2484 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2185, - "step": 2485 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5393, - "step": 2486 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0286, - "step": 2487 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3738, - "step": 2488 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2249, - "step": 2489 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7828, - "step": 2490 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.87, - "step": 2491 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.681, - "step": 2492 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5446, - "step": 2493 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0769, - "step": 2494 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3587, - "step": 2495 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9188, - "step": 2496 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9357, - "step": 2497 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3449, - "step": 2498 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2753, - "step": 2499 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3077, - "step": 2500 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0085, - "step": 2501 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5436, - "step": 2502 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9096, - "step": 2503 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7288, - "step": 2504 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7344, - "step": 2505 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6385, - "step": 2506 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6713, - "step": 2507 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6065, - "step": 2508 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3456, - "step": 2509 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1633, - "step": 2510 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5557, - "step": 2511 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7075, - "step": 2512 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4954, - "step": 2513 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5104, - "step": 2514 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5706, - "step": 2515 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7479, - "step": 2516 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7042, - "step": 2517 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9569, - "step": 2518 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7846, - "step": 2519 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.749, - "step": 2520 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5868, - "step": 2521 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3957, - "step": 2522 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2594, - "step": 2523 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.632, - "step": 2524 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.023, - "step": 2525 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0423, - "step": 2526 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1901, - "step": 2527 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0025, - "step": 2528 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0252, - "step": 2529 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8165, - "step": 2530 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6864, - "step": 2531 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.1174, - "step": 2532 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.46, - "step": 2533 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3714, - "step": 2534 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1206, - "step": 2535 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3734, - "step": 2536 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7229, - "step": 2537 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0124, - "step": 2538 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2105, - "step": 2539 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1127, - "step": 2540 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.1163, - "step": 2541 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5255, - "step": 2542 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2058, - "step": 2543 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7425, - "step": 2544 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3621, - "step": 2545 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7541, - "step": 2546 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9611, - "step": 2547 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3031, - "step": 2548 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1756, - "step": 2549 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6146, - "step": 2550 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1434, - "step": 2551 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0786, - "step": 2552 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9498, - "step": 2553 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8681, - "step": 2554 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5898, - "step": 2555 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7522, - "step": 2556 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3648, - "step": 2557 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8895, - "step": 2558 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9903, - "step": 2559 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1113, - "step": 2560 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6541, - "step": 2561 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8563, - "step": 2562 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.0685, - "step": 2563 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.59, - "step": 2564 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0927, - "step": 2565 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3792, - "step": 2566 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.208, - "step": 2567 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9275, - "step": 2568 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.25, - "step": 2569 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9524, - "step": 2570 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.556, - "step": 2571 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6414, - "step": 2572 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1749, - "step": 2573 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4529, - "step": 2574 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9764, - "step": 2575 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1835, - "step": 2576 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.438, - "step": 2577 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.834, - "step": 2578 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8879, - "step": 2579 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1302, - "step": 2580 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8966, - "step": 2581 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7319, - "step": 2582 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3287, - "step": 2583 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3322, - "step": 2584 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0278, - "step": 2585 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5355, - "step": 2586 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2016, - "step": 2587 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8335, - "step": 2588 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.721, - "step": 2589 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4628, - "step": 2590 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7896, - "step": 2591 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7226, - "step": 2592 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5834, - "step": 2593 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8944, - "step": 2594 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1519, - "step": 2595 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2173, - "step": 2596 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9751, - "step": 2597 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1408, - "step": 2598 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2469, - "step": 2599 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3136, - "step": 2600 - }, - { - "epoch": 0.02, - "eval_loss": 6.580307483673096, - "eval_runtime": 22.5866, - "eval_samples_per_second": 2.214, - "eval_steps_per_second": 1.107, - "step": 2600 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.17715097402597402, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.21428571428571427, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.0, - "mmlu_loss": 3.684196367263794, - "step": 2600 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4702, - "step": 2601 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2103, - "step": 2602 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1688, - "step": 2603 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0882, - "step": 2604 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4417, - "step": 2605 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4982, - "step": 2606 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.3721, - "step": 2607 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5558, - "step": 2608 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.251, - "step": 2609 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5517, - "step": 2610 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5841, - "step": 2611 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3849, - "step": 2612 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5556, - "step": 2613 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4158, - "step": 2614 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9386, - "step": 2615 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6376, - "step": 2616 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7976, - "step": 2617 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.291, - "step": 2618 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8779, - "step": 2619 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8159, - "step": 2620 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1198, - "step": 2621 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9783, - "step": 2622 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0635, - "step": 2623 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8539, - "step": 2624 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5904, - "step": 2625 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7561, - "step": 2626 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3628, - "step": 2627 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.2452, - "step": 2628 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8438, - "step": 2629 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7726, - "step": 2630 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.8356, - "step": 2631 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6763, - "step": 2632 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9104, - "step": 2633 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1199, - "step": 2634 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4833, - "step": 2635 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6164, - "step": 2636 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2284, - "step": 2637 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8304, - "step": 2638 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7706, - "step": 2639 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.638, - "step": 2640 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9576, - "step": 2641 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0828, - "step": 2642 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5206, - "step": 2643 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7744, - "step": 2644 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5361, - "step": 2645 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9041, - "step": 2646 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6145, - "step": 2647 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9121, - "step": 2648 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1322, - "step": 2649 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.1881, - "step": 2650 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6446, - "step": 2651 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9137, - "step": 2652 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4681, - "step": 2653 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9891, - "step": 2654 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3255, - "step": 2655 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.3909, - "step": 2656 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6869, - "step": 2657 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0611, - "step": 2658 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3314, - "step": 2659 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6415, - "step": 2660 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5881, - "step": 2661 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8889, - "step": 2662 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3336, - "step": 2663 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1282, - "step": 2664 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.158, - "step": 2665 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1773, - "step": 2666 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9617, - "step": 2667 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5576, - "step": 2668 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8284, - "step": 2669 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5936, - "step": 2670 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0931, - "step": 2671 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.07, - "step": 2672 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.778, - "step": 2673 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7786, - "step": 2674 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1279, - "step": 2675 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.463, - "step": 2676 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2192, - "step": 2677 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4147, - "step": 2678 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9957, - "step": 2679 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8919, - "step": 2680 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1558, - "step": 2681 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7069, - "step": 2682 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.487, - "step": 2683 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7098, - "step": 2684 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1032, - "step": 2685 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9937, - "step": 2686 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.0677, - "step": 2687 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.644, - "step": 2688 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5099, - "step": 2689 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6922, - "step": 2690 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7491, - "step": 2691 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.605, - "step": 2692 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1904, - "step": 2693 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9175, - "step": 2694 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3251, - "step": 2695 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.315, - "step": 2696 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3052, - "step": 2697 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2122, - "step": 2698 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9462, - "step": 2699 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3221, - "step": 2700 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3125, - "step": 2701 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.938, - "step": 2702 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0983, - "step": 2703 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8028, - "step": 2704 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4752, - "step": 2705 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.16, - "step": 2706 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2429, - "step": 2707 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.623, - "step": 2708 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9595, - "step": 2709 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5444, - "step": 2710 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.6245, - "step": 2711 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.919, - "step": 2712 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7332, - "step": 2713 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0067, - "step": 2714 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6957, - "step": 2715 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.994, - "step": 2716 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7562, - "step": 2717 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6882, - "step": 2718 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8732, - "step": 2719 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6496, - "step": 2720 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4841, - "step": 2721 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4927, - "step": 2722 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7437, - "step": 2723 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9469, - "step": 2724 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1481, - "step": 2725 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7762, - "step": 2726 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8806, - "step": 2727 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8352, - "step": 2728 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9488, - "step": 2729 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1679, - "step": 2730 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2412, - "step": 2731 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6839, - "step": 2732 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7269, - "step": 2733 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6139, - "step": 2734 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8754, - "step": 2735 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9007, - "step": 2736 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9134, - "step": 2737 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9469, - "step": 2738 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9293, - "step": 2739 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0489, - "step": 2740 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4197, - "step": 2741 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.3667, - "step": 2742 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8059, - "step": 2743 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.024, - "step": 2744 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0756, - "step": 2745 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0596, - "step": 2746 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1416, - "step": 2747 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1271, - "step": 2748 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1452, - "step": 2749 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.9527, - "step": 2750 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9189, - "step": 2751 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4364, - "step": 2752 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4173, - "step": 2753 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4034, - "step": 2754 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6678, - "step": 2755 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1092, - "step": 2756 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7751, - "step": 2757 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0164, - "step": 2758 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5796, - "step": 2759 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7851, - "step": 2760 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1784, - "step": 2761 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7417, - "step": 2762 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4268, - "step": 2763 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6919, - "step": 2764 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1838, - "step": 2765 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5592, - "step": 2766 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.425, - "step": 2767 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.848, - "step": 2768 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5683, - "step": 2769 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0262, - "step": 2770 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8063, - "step": 2771 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6139, - "step": 2772 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3478, - "step": 2773 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1811, - "step": 2774 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4519, - "step": 2775 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0071, - "step": 2776 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7872, - "step": 2777 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2263, - "step": 2778 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8923, - "step": 2779 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2997, - "step": 2780 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6857, - "step": 2781 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8874, - "step": 2782 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8203, - "step": 2783 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9583, - "step": 2784 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0814, - "step": 2785 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.419, - "step": 2786 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3063, - "step": 2787 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1285, - "step": 2788 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0039, - "step": 2789 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.671, - "step": 2790 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5452, - "step": 2791 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3885, - "step": 2792 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6282, - "step": 2793 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5913, - "step": 2794 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6189, - "step": 2795 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2968, - "step": 2796 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2825, - "step": 2797 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9897, - "step": 2798 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8193, - "step": 2799 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7042, - "step": 2800 - }, - { - "epoch": 0.02, - "eval_loss": 6.604581832885742, - "eval_runtime": 22.516, - "eval_samples_per_second": 2.221, - "eval_steps_per_second": 1.11, - "step": 2800 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 4.006761569976806, - "step": 2800 - }, - { - "epoch": 0.02, - "step": 2800, - "total_flos": 4.660001608148582e+16, - "train_loss": 6.312225336258395, - "train_runtime": 7855.0688, - "train_samples_per_second": 3.819, - "train_steps_per_second": 3.819 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0757, - "step": 2801 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8944, - "step": 2802 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8365, - "step": 2803 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.873, - "step": 2804 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3793, - "step": 2805 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1923, - "step": 2806 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2306, - "step": 2807 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4439, - "step": 2808 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3377, - "step": 2809 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8737, - "step": 2810 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4191, - "step": 2811 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.892, - "step": 2812 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4568, - "step": 2813 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0652, - "step": 2814 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6912, - "step": 2815 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9033, - "step": 2816 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4433, - "step": 2817 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7591, - "step": 2818 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4458, - "step": 2819 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3721, - "step": 2820 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4111, - "step": 2821 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0999, - "step": 2822 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5154, - "step": 2823 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1967, - "step": 2824 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8437, - "step": 2825 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.759, - "step": 2826 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6223, - "step": 2827 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3442, - "step": 2828 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1991, - "step": 2829 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5302, - "step": 2830 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1725, - "step": 2831 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8107, - "step": 2832 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7891, - "step": 2833 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5592, - "step": 2834 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8792, - "step": 2835 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2387, - "step": 2836 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9369, - "step": 2837 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2098, - "step": 2838 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6645, - "step": 2839 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2349, - "step": 2840 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8613, - "step": 2841 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5482, - "step": 2842 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5438, - "step": 2843 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6133, - "step": 2844 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9983, - "step": 2845 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8706, - "step": 2846 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9685, - "step": 2847 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.04, - "step": 2848 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6498, - "step": 2849 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6786, - "step": 2850 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.789, - "step": 2851 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.1116, - "step": 2852 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7085, - "step": 2853 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1083, - "step": 2854 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0795, - "step": 2855 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8677, - "step": 2856 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1663, - "step": 2857 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5379, - "step": 2858 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4923, - "step": 2859 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1456, - "step": 2860 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1686, - "step": 2861 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4153, - "step": 2862 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.17, - "step": 2863 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3301, - "step": 2864 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7106, - "step": 2865 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.463, - "step": 2866 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.614, - "step": 2867 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1451, - "step": 2868 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6817, - "step": 2869 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9577, - "step": 2870 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6171, - "step": 2871 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5129, - "step": 2872 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3386, - "step": 2873 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1176, - "step": 2874 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9075, - "step": 2875 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.667, - "step": 2876 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8097, - "step": 2877 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7707, - "step": 2878 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7571, - "step": 2879 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0732, - "step": 2880 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5586, - "step": 2881 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8045, - "step": 2882 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4442, - "step": 2883 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.418, - "step": 2884 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7253, - "step": 2885 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4227, - "step": 2886 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9878, - "step": 2887 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8351, - "step": 2888 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1715, - "step": 2889 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1207, - "step": 2890 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0396, - "step": 2891 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7162, - "step": 2892 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2566, - "step": 2893 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4087, - "step": 2894 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4509, - "step": 2895 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8165, - "step": 2896 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9842, - "step": 2897 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.574, - "step": 2898 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4512, - "step": 2899 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9904, - "step": 2900 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6141, - "step": 2901 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9528, - "step": 2902 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.9028, - "step": 2903 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3868, - "step": 2904 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0461, - "step": 2905 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5161, - "step": 2906 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.107, - "step": 2907 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7174, - "step": 2908 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7671, - "step": 2909 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6803, - "step": 2910 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5357, - "step": 2911 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6898, - "step": 2912 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8564, - "step": 2913 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1457, - "step": 2914 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3408, - "step": 2915 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6095, - "step": 2916 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.866, - "step": 2917 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7324, - "step": 2918 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4026, - "step": 2919 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1467, - "step": 2920 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2496, - "step": 2921 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5265, - "step": 2922 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8246, - "step": 2923 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5775, - "step": 2924 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2886, - "step": 2925 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3076, - "step": 2926 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7975, - "step": 2927 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9898, - "step": 2928 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7256, - "step": 2929 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7971, - "step": 2930 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5884, - "step": 2931 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0749, - "step": 2932 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6183, - "step": 2933 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0019, - "step": 2934 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1706, - "step": 2935 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4075, - "step": 2936 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4092, - "step": 2937 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9424, - "step": 2938 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9025, - "step": 2939 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7751, - "step": 2940 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.988, - "step": 2941 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1208, - "step": 2942 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1999, - "step": 2943 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2129, - "step": 2944 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4431, - "step": 2945 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1979, - "step": 2946 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8246, - "step": 2947 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4876, - "step": 2948 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7158, - "step": 2949 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3611, - "step": 2950 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.9919, - "step": 2951 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4136, - "step": 2952 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.801, - "step": 2953 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6503, - "step": 2954 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.553, - "step": 2955 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3536, - "step": 2956 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8516, - "step": 2957 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.9344, - "step": 2958 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8727, - "step": 2959 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9155, - "step": 2960 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9684, - "step": 2961 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0399, - "step": 2962 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4298, - "step": 2963 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4559, - "step": 2964 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0361, - "step": 2965 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0081, - "step": 2966 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6698, - "step": 2967 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3355, - "step": 2968 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7555, - "step": 2969 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.29, - "step": 2970 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4079, - "step": 2971 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0588, - "step": 2972 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2956, - "step": 2973 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7318, - "step": 2974 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8846, - "step": 2975 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5159, - "step": 2976 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7629, - "step": 2977 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2039, - "step": 2978 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.311, - "step": 2979 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9786, - "step": 2980 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7138, - "step": 2981 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4827, - "step": 2982 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5264, - "step": 2983 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8153, - "step": 2984 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3512, - "step": 2985 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1515, - "step": 2986 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1689, - "step": 2987 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8027, - "step": 2988 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7472, - "step": 2989 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0236, - "step": 2990 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1741, - "step": 2991 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8884, - "step": 2992 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3912, - "step": 2993 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2109, - "step": 2994 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1693, - "step": 2995 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8166, - "step": 2996 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4902, - "step": 2997 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3921, - "step": 2998 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8787, - "step": 2999 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1304, - "step": 3000 - }, - { - "epoch": 0.02, - "eval_loss": 6.659167289733887, - "eval_runtime": 22.4512, - "eval_samples_per_second": 2.227, - "eval_steps_per_second": 1.114, - "step": 3000 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.973116703033447, - "step": 3000 - } - ], - "max_steps": 30000, - "num_train_epochs": 1, - "total_flos": 4.959157820502835e+16, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoint-3000/training_args.bin b/checkpoint-3000/training_args.bin deleted file mode 100644 index 53a16291359ea01b885cc36189679e385fee54a8..0000000000000000000000000000000000000000 --- a/checkpoint-3000/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f2f399ab69470e06aaa321f2990a85c1505da75b9e960c095081ae355addfd1d -size 6011 diff --git a/checkpoint-3200/README.md b/checkpoint-3200/README.md deleted file mode 100644 index 82793f73e61dbb024e11fc6697bba1622d4d0db6..0000000000000000000000000000000000000000 --- a/checkpoint-3200/README.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -library_name: peft ---- -## Training procedure - - -The following `bitsandbytes` quantization config was used during training: -- load_in_8bit: False -- load_in_4bit: True -- llm_int8_threshold: 6.0 -- llm_int8_skip_modules: None -- llm_int8_enable_fp32_cpu_offload: False -- llm_int8_has_fp16_weight: False -- bnb_4bit_quant_type: nf4 -- bnb_4bit_use_double_quant: True -- bnb_4bit_compute_dtype: bfloat16 -### Framework versions - - -- PEFT 0.4.0 diff --git a/checkpoint-3200/adapter_config.json b/checkpoint-3200/adapter_config.json deleted file mode 100644 index 2adcd7d22e9c842efe5230fdbfc7ae7a84aededb..0000000000000000000000000000000000000000 --- a/checkpoint-3200/adapter_config.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "auto_mapping": null, - "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": true, - "layers_pattern": null, - "layers_to_transform": null, - "lora_alpha": 16.0, - "lora_dropout": 0.1, - "modules_to_save": null, - "peft_type": "LORA", - "r": 64, - "revision": null, - "target_modules": [ - "q_proj", - "o_proj", - "k_proj", - "gate_proj", - "down_proj", - "v_proj", - "up_proj" - ], - "task_type": "CAUSAL_LM" -} \ No newline at end of file diff --git a/checkpoint-3200/adapter_model.bin b/checkpoint-3200/adapter_model.bin deleted file mode 100644 index 0b86abcee9474f457ab414bad9298942abcb3ba0..0000000000000000000000000000000000000000 --- a/checkpoint-3200/adapter_model.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:64388cba943163b96b2a1ca150029bdf9bdf15cd06d37f772e1d80a98c46e956 -size 871609293 diff --git a/checkpoint-3200/adapter_model/adapter_model/README.md b/checkpoint-3200/adapter_model/adapter_model/README.md deleted file mode 100644 index 5f53b1d1fb6c73b71b73ea36af61fcd504b1117e..0000000000000000000000000000000000000000 --- a/checkpoint-3200/adapter_model/adapter_model/README.md +++ /dev/null @@ -1,80 +0,0 @@ ---- -library_name: peft ---- -## Training procedure - - -The following `bitsandbytes` quantization config was used during training: -- load_in_8bit: False -- load_in_4bit: True -- llm_int8_threshold: 6.0 -- llm_int8_skip_modules: None -- llm_int8_enable_fp32_cpu_offload: False -- llm_int8_has_fp16_weight: False -- bnb_4bit_quant_type: nf4 -- bnb_4bit_use_double_quant: True -- bnb_4bit_compute_dtype: bfloat16 - -The following `bitsandbytes` quantization config was used during training: -- load_in_8bit: False -- load_in_4bit: True -- llm_int8_threshold: 6.0 -- llm_int8_skip_modules: None -- llm_int8_enable_fp32_cpu_offload: False -- llm_int8_has_fp16_weight: False -- bnb_4bit_quant_type: nf4 -- bnb_4bit_use_double_quant: True -- bnb_4bit_compute_dtype: bfloat16 - -The following `bitsandbytes` quantization config was used during training: -- load_in_8bit: False -- load_in_4bit: True -- llm_int8_threshold: 6.0 -- llm_int8_skip_modules: None -- llm_int8_enable_fp32_cpu_offload: False -- llm_int8_has_fp16_weight: False -- bnb_4bit_quant_type: nf4 -- bnb_4bit_use_double_quant: True -- bnb_4bit_compute_dtype: bfloat16 - -The following `bitsandbytes` quantization config was used during training: -- load_in_8bit: False -- load_in_4bit: True -- llm_int8_threshold: 6.0 -- llm_int8_skip_modules: None -- llm_int8_enable_fp32_cpu_offload: False -- llm_int8_has_fp16_weight: False -- bnb_4bit_quant_type: nf4 -- bnb_4bit_use_double_quant: True -- bnb_4bit_compute_dtype: bfloat16 - -The following `bitsandbytes` quantization config was used during training: -- load_in_8bit: False -- load_in_4bit: True -- llm_int8_threshold: 6.0 -- llm_int8_skip_modules: None -- llm_int8_enable_fp32_cpu_offload: False -- llm_int8_has_fp16_weight: False -- bnb_4bit_quant_type: nf4 -- bnb_4bit_use_double_quant: True -- bnb_4bit_compute_dtype: bfloat16 - -The following `bitsandbytes` quantization config was used during training: -- load_in_8bit: False -- load_in_4bit: True -- llm_int8_threshold: 6.0 -- llm_int8_skip_modules: None -- llm_int8_enable_fp32_cpu_offload: False -- llm_int8_has_fp16_weight: False -- bnb_4bit_quant_type: nf4 -- bnb_4bit_use_double_quant: True -- bnb_4bit_compute_dtype: bfloat16 -### Framework versions - -- PEFT 0.4.0 -- PEFT 0.4.0 -- PEFT 0.4.0 -- PEFT 0.4.0 -- PEFT 0.4.0 - -- PEFT 0.4.0 diff --git a/checkpoint-3200/adapter_model/adapter_model/adapter_config.json b/checkpoint-3200/adapter_model/adapter_model/adapter_config.json deleted file mode 100644 index 5f25f5a5ee344124c6acb2dc7e557d94323e30ce..0000000000000000000000000000000000000000 --- a/checkpoint-3200/adapter_model/adapter_model/adapter_config.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "auto_mapping": null, - "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": true, - "layers_pattern": null, - "layers_to_transform": null, - "lora_alpha": 16.0, - "lora_dropout": 0.1, - "modules_to_save": null, - "peft_type": "LORA", - "r": 64, - "revision": null, - "target_modules": [ - "k_proj", - "down_proj", - "up_proj", - "o_proj", - "v_proj", - "gate_proj", - "q_proj" - ], - "task_type": "CAUSAL_LM" -} \ No newline at end of file diff --git a/checkpoint-3200/adapter_model/adapter_model/adapter_model.bin b/checkpoint-3200/adapter_model/adapter_model/adapter_model.bin deleted file mode 100644 index 04424f907c6e108ff06197959814a1580dd2cc08..0000000000000000000000000000000000000000 --- a/checkpoint-3200/adapter_model/adapter_model/adapter_model.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:71c416d1623cb5276645beb7e7cfb4801370991a60a1b521234b7a2cb2714e07 -size 871609293 diff --git a/checkpoint-3200/added_tokens.json b/checkpoint-3200/added_tokens.json deleted file mode 100644 index e41416ddd79948246ea2dced6800ea3cd531c424..0000000000000000000000000000000000000000 --- a/checkpoint-3200/added_tokens.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "[PAD]": 32000 -} diff --git a/checkpoint-3200/optimizer.pt b/checkpoint-3200/optimizer.pt deleted file mode 100644 index 7b055cb9f2090ff5e60a46b2b59a202c22e961e0..0000000000000000000000000000000000000000 --- a/checkpoint-3200/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:73796122d5f778f1dba070c9a5954bdc974bedd5a617c8ab060b0246eab413cd -size 873873439 diff --git a/checkpoint-3200/rng_state.pth b/checkpoint-3200/rng_state.pth deleted file mode 100644 index 7cd66b8d089d482576f71d9169f5557308b3688d..0000000000000000000000000000000000000000 --- a/checkpoint-3200/rng_state.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a1b6d0cbcc57fd06ac5ce16be5ae8fc2396a2014de5244c2c94f9aec7736904e -size 14511 diff --git a/checkpoint-3200/scheduler.pt b/checkpoint-3200/scheduler.pt deleted file mode 100644 index f658984f48aed2698e469bac0bfffb4ef21e2885..0000000000000000000000000000000000000000 --- a/checkpoint-3200/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9ef1ca3e6fc07b43239ed034e2d8e5ae6ded24ae869473b3f8f48afde040dedc -size 627 diff --git a/checkpoint-3200/special_tokens_map.json b/checkpoint-3200/special_tokens_map.json deleted file mode 100644 index 3f58a5e115855c6ea3cec98accae196ad927222e..0000000000000000000000000000000000000000 --- a/checkpoint-3200/special_tokens_map.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "bos_token": "", - "eos_token": "", - "pad_token": "[PAD]", - "unk_token": "" -} diff --git a/checkpoint-3200/tokenizer.model b/checkpoint-3200/tokenizer.model deleted file mode 100644 index 6c00c742ce03c627d6cd5b795984876fa49fa899..0000000000000000000000000000000000000000 --- a/checkpoint-3200/tokenizer.model +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 -size 499723 diff --git a/checkpoint-3200/tokenizer_config.json b/checkpoint-3200/tokenizer_config.json deleted file mode 100644 index daaef2433dab9469de98b5b9a3848221ab25b7e8..0000000000000000000000000000000000000000 --- a/checkpoint-3200/tokenizer_config.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "bos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - }, - "clean_up_tokenization_spaces": false, - "eos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - }, - "legacy": null, - "model_max_length": 1000000000000000019884624838656, - "pad_token": null, - "padding_side": "right", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizer", - "unk_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - } -} diff --git a/checkpoint-3200/trainer_state.json b/checkpoint-3200/trainer_state.json deleted file mode 100644 index 882ba73e714478113669e9aa3b118d2072172605..0000000000000000000000000000000000000000 --- a/checkpoint-3200/trainer_state.json +++ /dev/null @@ -1,19513 +0,0 @@ -{ - "best_metric": 6.423073768615723, - "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-3200", - "epoch": 0.024444274692536856, - "global_step": 3200, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0808, - "step": 1 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8773, - "step": 2 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1965, - "step": 3 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.118, - "step": 4 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1773, - "step": 5 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1165, - "step": 6 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2666, - "step": 7 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3704, - "step": 8 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9976, - "step": 9 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.985, - "step": 10 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.0541, - "step": 11 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.6228, - "step": 12 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.3651, - "step": 13 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0867, - "step": 14 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.4422, - "step": 15 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.7759, - "step": 16 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1446, - "step": 17 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0007, - "step": 18 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0894, - "step": 19 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2424, - "step": 20 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.1343, - "step": 21 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5354, - "step": 22 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1887, - "step": 23 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.6652, - "step": 24 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.964, - "step": 25 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1872, - "step": 26 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.4722, - "step": 27 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1462, - "step": 28 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0485, - "step": 29 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.148, - "step": 30 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7274, - "step": 31 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.6689, - "step": 32 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3384, - "step": 33 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.5354, - "step": 34 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.1976, - "step": 35 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.8593, - "step": 36 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.9302, - "step": 37 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5968, - "step": 38 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3169, - "step": 39 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.1793, - "step": 40 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.8457, - "step": 41 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5177, - "step": 42 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.003, - "step": 43 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.9928, - "step": 44 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.2574, - "step": 45 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3915, - "step": 46 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4105, - "step": 47 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.1184, - "step": 48 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.72, - "step": 49 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9628, - "step": 50 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2372, - "step": 51 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3733, - "step": 52 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8936, - "step": 53 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5353, - "step": 54 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.0754, - "step": 55 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6685, - "step": 56 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8984, - "step": 57 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2265, - "step": 58 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7696, - "step": 59 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7349, - "step": 60 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0221, - "step": 61 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.1901, - "step": 62 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.387, - "step": 63 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7323, - "step": 64 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2077, - "step": 65 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.3155, - "step": 66 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1656, - "step": 67 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 13.0828, - "step": 68 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5295, - "step": 69 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4575, - "step": 70 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.7654, - "step": 71 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.6263, - "step": 72 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 24.8238, - "step": 73 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.0654, - "step": 74 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 28.1046, - "step": 75 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.3232, - "step": 76 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 22.9712, - "step": 77 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 18.8529, - "step": 78 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.8356, - "step": 79 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 16.472, - "step": 80 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.2369, - "step": 81 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.0731, - "step": 82 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.8853, - "step": 83 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5438, - "step": 84 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2665, - "step": 85 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5484, - "step": 86 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7546, - "step": 87 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4309, - "step": 88 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5593, - "step": 89 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3822, - "step": 90 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.6315, - "step": 91 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6116, - "step": 92 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2288, - "step": 93 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0053, - "step": 94 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.359, - "step": 95 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9235, - "step": 96 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 31.9845, - "step": 97 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.1385, - "step": 98 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6161, - "step": 99 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8096, - "step": 100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9918, - "step": 101 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.344, - "step": 102 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1607, - "step": 103 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4834, - "step": 104 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.704, - "step": 105 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1238, - "step": 106 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8066, - "step": 107 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9656, - "step": 108 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1979, - "step": 109 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2294, - "step": 110 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.066, - "step": 111 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7914, - "step": 112 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7344, - "step": 113 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6703, - "step": 114 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8817, - "step": 115 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.7733, - "step": 116 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.469, - "step": 117 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1304, - "step": 118 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.871, - "step": 119 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5353, - "step": 120 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9055, - "step": 121 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6142, - "step": 122 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0201, - "step": 123 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3805, - "step": 124 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6825, - "step": 125 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7166, - "step": 126 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7747, - "step": 127 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7695, - "step": 128 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7291, - "step": 129 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1296, - "step": 130 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5374, - "step": 131 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1854, - "step": 132 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.434, - "step": 133 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.438, - "step": 134 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3027, - "step": 135 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.382, - "step": 136 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9277, - "step": 137 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.223, - "step": 138 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3042, - "step": 139 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6361, - "step": 140 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3547, - "step": 141 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.7181, - "step": 142 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.7528, - "step": 143 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.4316, - "step": 144 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2219, - "step": 145 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7788, - "step": 146 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2749, - "step": 147 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2397, - "step": 148 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6243, - "step": 149 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.145, - "step": 150 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7951, - "step": 151 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1862, - "step": 152 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1305, - "step": 153 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5766, - "step": 154 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9232, - "step": 155 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9936, - "step": 156 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.9692, - "step": 157 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2772, - "step": 158 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.302, - "step": 159 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9931, - "step": 160 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9675, - "step": 161 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8536, - "step": 162 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6589, - "step": 163 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.932, - "step": 164 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0301, - "step": 165 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4861, - "step": 166 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1354, - "step": 167 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0717, - "step": 168 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9346, - "step": 169 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9373, - "step": 170 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8777, - "step": 171 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4193, - "step": 172 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6831, - "step": 173 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4175, - "step": 174 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3629, - "step": 175 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.118, - "step": 176 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.633, - "step": 177 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8355, - "step": 178 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4522, - "step": 179 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9272, - "step": 180 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4631, - "step": 181 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2987, - "step": 182 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1183, - "step": 183 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9976, - "step": 184 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0668, - "step": 185 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6291, - "step": 186 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5937, - "step": 187 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7382, - "step": 188 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7677, - "step": 189 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0293, - "step": 190 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6407, - "step": 191 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9508, - "step": 192 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.5053, - "step": 193 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5718, - "step": 194 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5211, - "step": 195 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9557, - "step": 196 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1609, - "step": 197 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8505, - "step": 198 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8278, - "step": 199 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8447, - "step": 200 - }, - { - "epoch": 0.0, - "eval_loss": 7.883856773376465, - "eval_runtime": 22.4254, - "eval_samples_per_second": 2.23, - "eval_steps_per_second": 1.115, - "step": 200 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 4.629522514343262, - "step": 200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3249, - "step": 201 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.352, - "step": 202 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2984, - "step": 203 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.2734, - "step": 204 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1, - "step": 205 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.448, - "step": 206 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2387, - "step": 207 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.861, - "step": 208 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.603, - "step": 209 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.29, - "step": 210 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2105, - "step": 211 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1949, - "step": 212 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0538, - "step": 213 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0343, - "step": 214 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7794, - "step": 215 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5532, - "step": 216 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2676, - "step": 217 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.566, - "step": 218 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0432, - "step": 219 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9391, - "step": 220 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.724, - "step": 221 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.229, - "step": 222 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3462, - "step": 223 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0752, - "step": 224 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1966, - "step": 225 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7279, - "step": 226 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8484, - "step": 227 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7291, - "step": 228 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2665, - "step": 229 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3551, - "step": 230 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7338, - "step": 231 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8407, - "step": 232 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3581, - "step": 233 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.441, - "step": 234 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0788, - "step": 235 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8404, - "step": 236 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4314, - "step": 237 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8426, - "step": 238 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.0205, - "step": 239 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4162, - "step": 240 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7515, - "step": 241 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1442, - "step": 242 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5868, - "step": 243 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6514, - "step": 244 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2683, - "step": 245 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.31, - "step": 246 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0161, - "step": 247 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.484, - "step": 248 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9726, - "step": 249 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0926, - "step": 250 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5279, - "step": 251 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0017, - "step": 252 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5684, - "step": 253 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3875, - "step": 254 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9489, - "step": 255 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.8948, - "step": 256 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0856, - "step": 257 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.599, - "step": 258 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1575, - "step": 259 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3701, - "step": 260 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.464, - "step": 261 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9193, - "step": 262 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5679, - "step": 263 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9424, - "step": 264 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6689, - "step": 265 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6475, - "step": 266 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4311, - "step": 267 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7426, - "step": 268 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5191, - "step": 269 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3059, - "step": 270 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0142, - "step": 271 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.4509, - "step": 272 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0831, - "step": 273 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6977, - "step": 274 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4236, - "step": 275 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2129, - "step": 276 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1394, - "step": 277 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.685, - "step": 278 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0275, - "step": 279 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.3215, - "step": 280 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6542, - "step": 281 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7614, - "step": 282 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2996, - "step": 283 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6275, - "step": 284 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8736, - "step": 285 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4667, - "step": 286 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8486, - "step": 287 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2125, - "step": 288 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4523, - "step": 289 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.551, - "step": 290 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.7158, - "step": 291 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5092, - "step": 292 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9169, - "step": 293 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5333, - "step": 294 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9949, - "step": 295 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.7189, - "step": 296 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2366, - "step": 297 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4745, - "step": 298 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2439, - "step": 299 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4176, - "step": 300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.9365, - "step": 301 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5309, - "step": 302 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2201, - "step": 303 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0312, - "step": 304 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4173, - "step": 305 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4856, - "step": 306 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5041, - "step": 307 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3597, - "step": 308 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8395, - "step": 309 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0776, - "step": 310 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7566, - "step": 311 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9767, - "step": 312 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3804, - "step": 313 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5327, - "step": 314 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5293, - "step": 315 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4531, - "step": 316 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3961, - "step": 317 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5669, - "step": 318 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8559, - "step": 319 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.117, - "step": 320 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4279, - "step": 321 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7977, - "step": 322 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.955, - "step": 323 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0164, - "step": 324 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.0495, - "step": 325 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2768, - "step": 326 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3162, - "step": 327 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.88, - "step": 328 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2157, - "step": 329 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8427, - "step": 330 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9729, - "step": 331 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1779, - "step": 332 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1302, - "step": 333 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7705, - "step": 334 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.523, - "step": 335 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9375, - "step": 336 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.1409, - "step": 337 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.633, - "step": 338 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6481, - "step": 339 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.933, - "step": 340 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9179, - "step": 341 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9332, - "step": 342 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6553, - "step": 343 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7412, - "step": 344 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.849, - "step": 345 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.7321, - "step": 346 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9717, - "step": 347 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3465, - "step": 348 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4535, - "step": 349 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2376, - "step": 350 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9025, - "step": 351 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.916, - "step": 352 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.3785, - "step": 353 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0576, - "step": 354 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5081, - "step": 355 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1303, - "step": 356 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3854, - "step": 357 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5553, - "step": 358 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9627, - "step": 359 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.402, - "step": 360 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3484, - "step": 361 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5428, - "step": 362 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9128, - "step": 363 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3934, - "step": 364 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4812, - "step": 365 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5395, - "step": 366 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6304, - "step": 367 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5626, - "step": 368 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5693, - "step": 369 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3458, - "step": 370 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6254, - "step": 371 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8706, - "step": 372 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6076, - "step": 373 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.2912, - "step": 374 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3326, - "step": 375 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3735, - "step": 376 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4916, - "step": 377 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5553, - "step": 378 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6241, - "step": 379 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6106, - "step": 380 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.266, - "step": 381 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7738, - "step": 382 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4988, - "step": 383 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2968, - "step": 384 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8512, - "step": 385 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0341, - "step": 386 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.898, - "step": 387 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.23, - "step": 388 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9608, - "step": 389 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.3679, - "step": 390 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.7074, - "step": 391 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9903, - "step": 392 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5845, - "step": 393 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6493, - "step": 394 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7962, - "step": 395 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4865, - "step": 396 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3418, - "step": 397 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3942, - "step": 398 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4715, - "step": 399 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2073, - "step": 400 - }, - { - "epoch": 0.0, - "eval_loss": 7.106412410736084, - "eval_runtime": 22.5667, - "eval_samples_per_second": 2.216, - "eval_steps_per_second": 1.108, - "step": 400 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 2.9128687667846678, - "step": 400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3984, - "step": 401 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7983, - "step": 402 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8589, - "step": 403 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9884, - "step": 404 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4427, - "step": 405 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0374, - "step": 406 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7999, - "step": 407 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2437, - "step": 408 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6902, - "step": 409 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.81, - "step": 410 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8979, - "step": 411 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0211, - "step": 412 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3945, - "step": 413 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5807, - "step": 414 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1433, - "step": 415 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9466, - "step": 416 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6276, - "step": 417 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4945, - "step": 418 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6215, - "step": 419 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.3919, - "step": 420 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7915, - "step": 421 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3284, - "step": 422 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8723, - "step": 423 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0149, - "step": 424 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.979, - "step": 425 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9175, - "step": 426 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4994, - "step": 427 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9791, - "step": 428 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1156, - "step": 429 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5813, - "step": 430 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1882, - "step": 431 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9956, - "step": 432 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6189, - "step": 433 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9624, - "step": 434 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5387, - "step": 435 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4605, - "step": 436 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.474, - "step": 437 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0497, - "step": 438 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5705, - "step": 439 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.275, - "step": 440 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9638, - "step": 441 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4857, - "step": 442 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3067, - "step": 443 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8152, - "step": 444 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1668, - "step": 445 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5293, - "step": 446 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3981, - "step": 447 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4787, - "step": 448 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5981, - "step": 449 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3569, - "step": 450 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4088, - "step": 451 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3677, - "step": 452 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4686, - "step": 453 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3552, - "step": 454 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7931, - "step": 455 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9285, - "step": 456 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0554, - "step": 457 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7277, - "step": 458 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2474, - "step": 459 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9274, - "step": 460 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2558, - "step": 461 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7547, - "step": 462 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1264, - "step": 463 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2124, - "step": 464 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8751, - "step": 465 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7317, - "step": 466 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3697, - "step": 467 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0021, - "step": 468 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3761, - "step": 469 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2291, - "step": 470 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7968, - "step": 471 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9454, - "step": 472 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0194, - "step": 473 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5048, - "step": 474 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6837, - "step": 475 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1066, - "step": 476 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3501, - "step": 477 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5071, - "step": 478 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1086, - "step": 479 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7269, - "step": 480 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5419, - "step": 481 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2974, - "step": 482 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.1433, - "step": 483 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0869, - "step": 484 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.032, - "step": 485 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0946, - "step": 486 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7162, - "step": 487 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0406, - "step": 488 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9048, - "step": 489 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2231, - "step": 490 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.6524, - "step": 491 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1151, - "step": 492 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.591, - "step": 493 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1628, - "step": 494 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0757, - "step": 495 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3471, - "step": 496 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9385, - "step": 497 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9362, - "step": 498 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2252, - "step": 499 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.359, - "step": 500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0497, - "step": 501 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0484, - "step": 502 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5773, - "step": 503 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.39, - "step": 504 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5923, - "step": 505 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2, - "step": 506 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5536, - "step": 507 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.8958, - "step": 508 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7763, - "step": 509 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2045, - "step": 510 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4219, - "step": 511 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6305, - "step": 512 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4243, - "step": 513 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7842, - "step": 514 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8769, - "step": 515 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8903, - "step": 516 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0489, - "step": 517 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1314, - "step": 518 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5973, - "step": 519 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8022, - "step": 520 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3539, - "step": 521 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.222, - "step": 522 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5403, - "step": 523 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1323, - "step": 524 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7813, - "step": 525 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4982, - "step": 526 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2426, - "step": 527 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0142, - "step": 528 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8996, - "step": 529 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8671, - "step": 530 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4139, - "step": 531 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9478, - "step": 532 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7062, - "step": 533 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0098, - "step": 534 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9195, - "step": 535 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0255, - "step": 536 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6291, - "step": 537 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3245, - "step": 538 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6382, - "step": 539 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.8076, - "step": 540 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6725, - "step": 541 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0563, - "step": 542 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.6178, - "step": 543 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7974, - "step": 544 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7535, - "step": 545 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4948, - "step": 546 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8941, - "step": 547 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6496, - "step": 548 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9084, - "step": 549 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.65, - "step": 550 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7673, - "step": 551 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2221, - "step": 552 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.14, - "step": 553 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.6747, - "step": 554 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8009, - "step": 555 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7307, - "step": 556 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0143, - "step": 557 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8098, - "step": 558 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.026, - "step": 559 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4572, - "step": 560 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7913, - "step": 561 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9962, - "step": 562 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.767, - "step": 563 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9497, - "step": 564 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9626, - "step": 565 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2536, - "step": 566 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0421, - "step": 567 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8177, - "step": 568 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9241, - "step": 569 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0162, - "step": 570 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3368, - "step": 571 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7515, - "step": 572 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6389, - "step": 573 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.662, - "step": 574 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8097, - "step": 575 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9346, - "step": 576 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3154, - "step": 577 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7724, - "step": 578 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3685, - "step": 579 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.2775, - "step": 580 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.106, - "step": 581 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4733, - "step": 582 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2334, - "step": 583 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9478, - "step": 584 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0013, - "step": 585 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7242, - "step": 586 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.922, - "step": 587 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1418, - "step": 588 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4472, - "step": 589 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4785, - "step": 590 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.783, - "step": 591 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0706, - "step": 592 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4136, - "step": 593 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5969, - "step": 594 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5157, - "step": 595 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5658, - "step": 596 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4647, - "step": 597 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2028, - "step": 598 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6913, - "step": 599 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7239, - "step": 600 - }, - { - "epoch": 0.0, - "eval_loss": 7.012163162231445, - "eval_runtime": 22.5807, - "eval_samples_per_second": 2.214, - "eval_steps_per_second": 1.107, - "step": 600 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.24488224029541, - "step": 600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5253, - "step": 601 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0392, - "step": 602 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.447, - "step": 603 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9441, - "step": 604 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1874, - "step": 605 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7817, - "step": 606 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0348, - "step": 607 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5593, - "step": 608 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9361, - "step": 609 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3534, - "step": 610 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.476, - "step": 611 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0937, - "step": 612 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3027, - "step": 613 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5586, - "step": 614 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3796, - "step": 615 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.676, - "step": 616 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5321, - "step": 617 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0059, - "step": 618 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6139, - "step": 619 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.2391, - "step": 620 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0636, - "step": 621 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0895, - "step": 622 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.62, - "step": 623 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0469, - "step": 624 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2173, - "step": 625 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9432, - "step": 626 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3928, - "step": 627 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0959, - "step": 628 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.1197, - "step": 629 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4277, - "step": 630 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.418, - "step": 631 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8687, - "step": 632 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0156, - "step": 633 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.573, - "step": 634 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.112, - "step": 635 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8954, - "step": 636 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.36, - "step": 637 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.924, - "step": 638 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4625, - "step": 639 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2023, - "step": 640 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0685, - "step": 641 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5304, - "step": 642 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4456, - "step": 643 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7271, - "step": 644 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6011, - "step": 645 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.895, - "step": 646 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.864, - "step": 647 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3452, - "step": 648 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8978, - "step": 649 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2253, - "step": 650 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2813, - "step": 651 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7248, - "step": 652 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4283, - "step": 653 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4304, - "step": 654 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3893, - "step": 655 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1115, - "step": 656 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5892, - "step": 657 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6572, - "step": 658 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.925, - "step": 659 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4431, - "step": 660 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7711, - "step": 661 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9439, - "step": 662 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3781, - "step": 663 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5573, - "step": 664 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.4476, - "step": 665 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0057, - "step": 666 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2702, - "step": 667 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5717, - "step": 668 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2242, - "step": 669 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1, - "step": 670 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0517, - "step": 671 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6543, - "step": 672 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1138, - "step": 673 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.461, - "step": 674 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7094, - "step": 675 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.521, - "step": 676 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7116, - "step": 677 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6343, - "step": 678 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3762, - "step": 679 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3603, - "step": 680 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7144, - "step": 681 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4545, - "step": 682 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8188, - "step": 683 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7965, - "step": 684 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4675, - "step": 685 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0436, - "step": 686 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1219, - "step": 687 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4517, - "step": 688 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8476, - "step": 689 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9284, - "step": 690 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7405, - "step": 691 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7142, - "step": 692 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3979, - "step": 693 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.3285, - "step": 694 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3418, - "step": 695 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4472, - "step": 696 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7355, - "step": 697 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7982, - "step": 698 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4516, - "step": 699 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2532, - "step": 700 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9959, - "step": 701 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0418, - "step": 702 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.7767, - "step": 703 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.774, - "step": 704 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8912, - "step": 705 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2178, - "step": 706 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6197, - "step": 707 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4755, - "step": 708 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8276, - "step": 709 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2925, - "step": 710 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3887, - "step": 711 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1465, - "step": 712 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5806, - "step": 713 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3063, - "step": 714 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6066, - "step": 715 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1536, - "step": 716 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5582, - "step": 717 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0353, - "step": 718 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6415, - "step": 719 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8291, - "step": 720 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.7575, - "step": 721 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9141, - "step": 722 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5217, - "step": 723 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4549, - "step": 724 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8112, - "step": 725 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2729, - "step": 726 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8515, - "step": 727 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9712, - "step": 728 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.097, - "step": 729 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0208, - "step": 730 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1377, - "step": 731 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4019, - "step": 732 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9869, - "step": 733 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2954, - "step": 734 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4144, - "step": 735 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8053, - "step": 736 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8891, - "step": 737 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.812, - "step": 738 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2657, - "step": 739 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3747, - "step": 740 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0364, - "step": 741 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8845, - "step": 742 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.887, - "step": 743 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0706, - "step": 744 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6619, - "step": 745 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2941, - "step": 746 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9192, - "step": 747 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9947, - "step": 748 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6376, - "step": 749 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0358, - "step": 750 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4578, - "step": 751 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7784, - "step": 752 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.632, - "step": 753 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8649, - "step": 754 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7951, - "step": 755 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3841, - "step": 756 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4558, - "step": 757 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7638, - "step": 758 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9413, - "step": 759 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0916, - "step": 760 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1351, - "step": 761 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6078, - "step": 762 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7982, - "step": 763 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6132, - "step": 764 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.551, - "step": 765 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3301, - "step": 766 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4888, - "step": 767 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1476, - "step": 768 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4244, - "step": 769 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6025, - "step": 770 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.102, - "step": 771 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.017, - "step": 772 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4101, - "step": 773 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1741, - "step": 774 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1256, - "step": 775 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5164, - "step": 776 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6959, - "step": 777 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7666, - "step": 778 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4336, - "step": 779 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 11.8478, - "step": 780 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8382, - "step": 781 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1792, - "step": 782 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4424, - "step": 783 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.345, - "step": 784 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6887, - "step": 785 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9867, - "step": 786 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6152, - "step": 787 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7283, - "step": 788 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0157, - "step": 789 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6044, - "step": 790 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4132, - "step": 791 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.735, - "step": 792 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3631, - "step": 793 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2308, - "step": 794 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2184, - "step": 795 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4661, - "step": 796 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9637, - "step": 797 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4178, - "step": 798 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5909, - "step": 799 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1482, - "step": 800 - }, - { - "epoch": 0.01, - "eval_loss": 7.355834484100342, - "eval_runtime": 22.6252, - "eval_samples_per_second": 2.21, - "eval_steps_per_second": 1.105, - "step": 800 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 5.191131496429444, - "step": 800 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.0427, - "step": 801 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2669, - "step": 802 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8026, - "step": 803 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4949, - "step": 804 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4491, - "step": 805 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0383, - "step": 806 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1213, - "step": 807 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5158, - "step": 808 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5648, - "step": 809 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9932, - "step": 810 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6441, - "step": 811 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8661, - "step": 812 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3609, - "step": 813 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6828, - "step": 814 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9693, - "step": 815 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3733, - "step": 816 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6286, - "step": 817 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4349, - "step": 818 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6706, - "step": 819 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3089, - "step": 820 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2394, - "step": 821 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.963, - "step": 822 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6564, - "step": 823 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.997, - "step": 824 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9261, - "step": 825 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1421, - "step": 826 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2335, - "step": 827 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3432, - "step": 828 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0154, - "step": 829 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5135, - "step": 830 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6226, - "step": 831 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1672, - "step": 832 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0853, - "step": 833 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1213, - "step": 834 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7815, - "step": 835 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8916, - "step": 836 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6464, - "step": 837 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3307, - "step": 838 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8165, - "step": 839 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.886, - "step": 840 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4781, - "step": 841 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8351, - "step": 842 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.358, - "step": 843 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6501, - "step": 844 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0864, - "step": 845 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2922, - "step": 846 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.9847, - "step": 847 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2558, - "step": 848 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0195, - "step": 849 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.996, - "step": 850 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5705, - "step": 851 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4136, - "step": 852 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6302, - "step": 853 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8761, - "step": 854 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4995, - "step": 855 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4762, - "step": 856 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5749, - "step": 857 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0273, - "step": 858 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8258, - "step": 859 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1836, - "step": 860 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5493, - "step": 861 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1891, - "step": 862 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7392, - "step": 863 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1655, - "step": 864 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5218, - "step": 865 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3759, - "step": 866 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2497, - "step": 867 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5901, - "step": 868 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0624, - "step": 869 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.2452, - "step": 870 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5649, - "step": 871 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0826, - "step": 872 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2703, - "step": 873 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9088, - "step": 874 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3875, - "step": 875 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2511, - "step": 876 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4065, - "step": 877 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.175, - "step": 878 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8358, - "step": 879 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3208, - "step": 880 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2049, - "step": 881 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8251, - "step": 882 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4262, - "step": 883 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2227, - "step": 884 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1062, - "step": 885 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9417, - "step": 886 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3315, - "step": 887 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0012, - "step": 888 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6386, - "step": 889 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0377, - "step": 890 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6707, - "step": 891 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4955, - "step": 892 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7343, - "step": 893 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8305, - "step": 894 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7016, - "step": 895 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7149, - "step": 896 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5649, - "step": 897 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.815, - "step": 898 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6135, - "step": 899 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8776, - "step": 900 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7288, - "step": 901 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8019, - "step": 902 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0921, - "step": 903 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.824, - "step": 904 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7151, - "step": 905 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5054, - "step": 906 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8095, - "step": 907 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3218, - "step": 908 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9993, - "step": 909 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4433, - "step": 910 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5863, - "step": 911 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.505, - "step": 912 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9734, - "step": 913 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1792, - "step": 914 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4574, - "step": 915 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2787, - "step": 916 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8201, - "step": 917 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2537, - "step": 918 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1387, - "step": 919 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7161, - "step": 920 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2207, - "step": 921 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7953, - "step": 922 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9949, - "step": 923 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9173, - "step": 924 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7903, - "step": 925 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4784, - "step": 926 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2264, - "step": 927 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.566, - "step": 928 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0686, - "step": 929 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.791, - "step": 930 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8393, - "step": 931 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4387, - "step": 932 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2374, - "step": 933 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9598, - "step": 934 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1597, - "step": 935 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0403, - "step": 936 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3301, - "step": 937 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.072, - "step": 938 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4788, - "step": 939 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0656, - "step": 940 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9647, - "step": 941 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1168, - "step": 942 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0293, - "step": 943 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3622, - "step": 944 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8957, - "step": 945 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4, - "step": 946 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6626, - "step": 947 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8212, - "step": 948 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8638, - "step": 949 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6406, - "step": 950 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7069, - "step": 951 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1384, - "step": 952 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.612, - "step": 953 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7201, - "step": 954 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3532, - "step": 955 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1266, - "step": 956 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6192, - "step": 957 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.826, - "step": 958 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9338, - "step": 959 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4487, - "step": 960 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.872, - "step": 961 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8601, - "step": 962 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7401, - "step": 963 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5412, - "step": 964 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2501, - "step": 965 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6837, - "step": 966 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6494, - "step": 967 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.604, - "step": 968 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.837, - "step": 969 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3957, - "step": 970 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3281, - "step": 971 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8264, - "step": 972 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6554, - "step": 973 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5768, - "step": 974 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4187, - "step": 975 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8479, - "step": 976 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9849, - "step": 977 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6471, - "step": 978 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8041, - "step": 979 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8876, - "step": 980 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6423, - "step": 981 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5329, - "step": 982 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2801, - "step": 983 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1699, - "step": 984 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6469, - "step": 985 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6766, - "step": 986 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7538, - "step": 987 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9606, - "step": 988 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0713, - "step": 989 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4965, - "step": 990 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3408, - "step": 991 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4007, - "step": 992 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8921, - "step": 993 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8681, - "step": 994 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.8867, - "step": 995 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.467, - "step": 996 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7895, - "step": 997 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0523, - "step": 998 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4032, - "step": 999 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7719, - "step": 1000 - }, - { - "epoch": 0.01, - "eval_loss": 6.766034126281738, - "eval_runtime": 22.4042, - "eval_samples_per_second": 2.232, - "eval_steps_per_second": 1.116, - "step": 1000 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 4.338861379623413, - "step": 1000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0285, - "step": 1001 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4571, - "step": 1002 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7721, - "step": 1003 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5108, - "step": 1004 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3813, - "step": 1005 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7963, - "step": 1006 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1101, - "step": 1007 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.021, - "step": 1008 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5916, - "step": 1009 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8813, - "step": 1010 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1193, - "step": 1011 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5092, - "step": 1012 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8569, - "step": 1013 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.119, - "step": 1014 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3247, - "step": 1015 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2358, - "step": 1016 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2795, - "step": 1017 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3466, - "step": 1018 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5443, - "step": 1019 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7296, - "step": 1020 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0412, - "step": 1021 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4829, - "step": 1022 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7901, - "step": 1023 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8077, - "step": 1024 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4887, - "step": 1025 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3095, - "step": 1026 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3235, - "step": 1027 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6315, - "step": 1028 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4294, - "step": 1029 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8457, - "step": 1030 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7583, - "step": 1031 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3129, - "step": 1032 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1832, - "step": 1033 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1764, - "step": 1034 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0101, - "step": 1035 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6524, - "step": 1036 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2825, - "step": 1037 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2262, - "step": 1038 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2533, - "step": 1039 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8794, - "step": 1040 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7901, - "step": 1041 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8351, - "step": 1042 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5888, - "step": 1043 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8932, - "step": 1044 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2999, - "step": 1045 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8396, - "step": 1046 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4209, - "step": 1047 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1524, - "step": 1048 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7784, - "step": 1049 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0179, - "step": 1050 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1153, - "step": 1051 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2149, - "step": 1052 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0117, - "step": 1053 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9693, - "step": 1054 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5656, - "step": 1055 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5, - "step": 1056 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.102, - "step": 1057 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3079, - "step": 1058 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5754, - "step": 1059 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6989, - "step": 1060 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9597, - "step": 1061 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3743, - "step": 1062 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8887, - "step": 1063 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3779, - "step": 1064 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5001, - "step": 1065 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4095, - "step": 1066 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5073, - "step": 1067 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1331, - "step": 1068 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.323, - "step": 1069 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6116, - "step": 1070 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1212, - "step": 1071 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0951, - "step": 1072 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2463, - "step": 1073 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4488, - "step": 1074 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.279, - "step": 1075 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5728, - "step": 1076 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1362, - "step": 1077 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6648, - "step": 1078 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.427, - "step": 1079 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8145, - "step": 1080 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5308, - "step": 1081 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.974, - "step": 1082 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1965, - "step": 1083 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8749, - "step": 1084 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7352, - "step": 1085 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7934, - "step": 1086 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6003, - "step": 1087 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5775, - "step": 1088 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.519, - "step": 1089 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7403, - "step": 1090 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8145, - "step": 1091 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5776, - "step": 1092 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3753, - "step": 1093 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9586, - "step": 1094 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7263, - "step": 1095 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7034, - "step": 1096 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0579, - "step": 1097 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8419, - "step": 1098 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0751, - "step": 1099 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6438, - "step": 1100 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8744, - "step": 1101 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4992, - "step": 1102 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8094, - "step": 1103 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.162, - "step": 1104 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8351, - "step": 1105 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8845, - "step": 1106 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1894, - "step": 1107 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.8333, - "step": 1108 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4226, - "step": 1109 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0712, - "step": 1110 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9981, - "step": 1111 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5885, - "step": 1112 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1915, - "step": 1113 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8003, - "step": 1114 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5566, - "step": 1115 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4085, - "step": 1116 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0793, - "step": 1117 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0909, - "step": 1118 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2273, - "step": 1119 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8273, - "step": 1120 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0231, - "step": 1121 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7229, - "step": 1122 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4479, - "step": 1123 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2178, - "step": 1124 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9038, - "step": 1125 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2653, - "step": 1126 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2974, - "step": 1127 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3003, - "step": 1128 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7853, - "step": 1129 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9143, - "step": 1130 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2573, - "step": 1131 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7091, - "step": 1132 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3372, - "step": 1133 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4165, - "step": 1134 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4422, - "step": 1135 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7693, - "step": 1136 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7802, - "step": 1137 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7263, - "step": 1138 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6749, - "step": 1139 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9459, - "step": 1140 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9697, - "step": 1141 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4506, - "step": 1142 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5099, - "step": 1143 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1475, - "step": 1144 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3769, - "step": 1145 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2035, - "step": 1146 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6017, - "step": 1147 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.463, - "step": 1148 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3844, - "step": 1149 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5306, - "step": 1150 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5502, - "step": 1151 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7394, - "step": 1152 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5626, - "step": 1153 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1618, - "step": 1154 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5174, - "step": 1155 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1038, - "step": 1156 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3789, - "step": 1157 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2985, - "step": 1158 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4763, - "step": 1159 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5071, - "step": 1160 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0827, - "step": 1161 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7349, - "step": 1162 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.798, - "step": 1163 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3176, - "step": 1164 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8114, - "step": 1165 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3379, - "step": 1166 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1157, - "step": 1167 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4675, - "step": 1168 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2721, - "step": 1169 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0603, - "step": 1170 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6358, - "step": 1171 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0865, - "step": 1172 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.709, - "step": 1173 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7705, - "step": 1174 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7677, - "step": 1175 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2418, - "step": 1176 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7114, - "step": 1177 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1165, - "step": 1178 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9654, - "step": 1179 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0672, - "step": 1180 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1738, - "step": 1181 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7604, - "step": 1182 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8426, - "step": 1183 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0231, - "step": 1184 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2938, - "step": 1185 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.783, - "step": 1186 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3328, - "step": 1187 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.321, - "step": 1188 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6368, - "step": 1189 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.101, - "step": 1190 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6777, - "step": 1191 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0831, - "step": 1192 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5853, - "step": 1193 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7923, - "step": 1194 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3734, - "step": 1195 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4268, - "step": 1196 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6796, - "step": 1197 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9028, - "step": 1198 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3716, - "step": 1199 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6761, - "step": 1200 - }, - { - "epoch": 0.01, - "eval_loss": 6.9188361167907715, - "eval_runtime": 22.426, - "eval_samples_per_second": 2.23, - "eval_steps_per_second": 1.115, - "step": 1200 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 3.3686839294433595, - "step": 1200 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8855, - "step": 1201 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8206, - "step": 1202 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4401, - "step": 1203 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2366, - "step": 1204 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9885, - "step": 1205 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5444, - "step": 1206 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4259, - "step": 1207 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5369, - "step": 1208 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0839, - "step": 1209 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7622, - "step": 1210 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8979, - "step": 1211 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5508, - "step": 1212 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6439, - "step": 1213 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6249, - "step": 1214 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.495, - "step": 1215 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0642, - "step": 1216 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8997, - "step": 1217 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6275, - "step": 1218 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3317, - "step": 1219 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4635, - "step": 1220 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5197, - "step": 1221 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5928, - "step": 1222 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2363, - "step": 1223 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0266, - "step": 1224 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3356, - "step": 1225 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7927, - "step": 1226 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6952, - "step": 1227 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8878, - "step": 1228 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7472, - "step": 1229 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6454, - "step": 1230 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4972, - "step": 1231 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3347, - "step": 1232 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1631, - "step": 1233 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4708, - "step": 1234 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5697, - "step": 1235 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8218, - "step": 1236 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.269, - "step": 1237 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4165, - "step": 1238 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3653, - "step": 1239 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0152, - "step": 1240 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9157, - "step": 1241 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4086, - "step": 1242 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2493, - "step": 1243 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8279, - "step": 1244 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6649, - "step": 1245 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4405, - "step": 1246 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1992, - "step": 1247 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2055, - "step": 1248 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4395, - "step": 1249 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2475, - "step": 1250 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8205, - "step": 1251 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1053, - "step": 1252 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7494, - "step": 1253 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7387, - "step": 1254 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8983, - "step": 1255 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5614, - "step": 1256 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7617, - "step": 1257 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2445, - "step": 1258 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3043, - "step": 1259 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4214, - "step": 1260 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1384, - "step": 1261 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3914, - "step": 1262 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3287, - "step": 1263 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2174, - "step": 1264 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4397, - "step": 1265 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6875, - "step": 1266 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4512, - "step": 1267 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2834, - "step": 1268 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7651, - "step": 1269 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9263, - "step": 1270 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6721, - "step": 1271 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9178, - "step": 1272 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7967, - "step": 1273 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5242, - "step": 1274 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7794, - "step": 1275 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4256, - "step": 1276 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5788, - "step": 1277 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7586, - "step": 1278 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.964, - "step": 1279 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0749, - "step": 1280 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6248, - "step": 1281 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2465, - "step": 1282 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1591, - "step": 1283 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4328, - "step": 1284 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.534, - "step": 1285 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.523, - "step": 1286 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5672, - "step": 1287 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9162, - "step": 1288 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1089, - "step": 1289 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3287, - "step": 1290 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2499, - "step": 1291 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9645, - "step": 1292 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3903, - "step": 1293 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5322, - "step": 1294 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2211, - "step": 1295 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2788, - "step": 1296 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1862, - "step": 1297 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2678, - "step": 1298 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5399, - "step": 1299 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7935, - "step": 1300 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0391, - "step": 1301 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1049, - "step": 1302 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.365, - "step": 1303 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8809, - "step": 1304 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2335, - "step": 1305 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.5135, - "step": 1306 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2378, - "step": 1307 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9265, - "step": 1308 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.641, - "step": 1309 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9822, - "step": 1310 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3369, - "step": 1311 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3735, - "step": 1312 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2618, - "step": 1313 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6854, - "step": 1314 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3748, - "step": 1315 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9206, - "step": 1316 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1969, - "step": 1317 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1245, - "step": 1318 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9977, - "step": 1319 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5319, - "step": 1320 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4431, - "step": 1321 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7264, - "step": 1322 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.05, - "step": 1323 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3118, - "step": 1324 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4575, - "step": 1325 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.593, - "step": 1326 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0061, - "step": 1327 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2481, - "step": 1328 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8017, - "step": 1329 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8617, - "step": 1330 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7036, - "step": 1331 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0091, - "step": 1332 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9687, - "step": 1333 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3925, - "step": 1334 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1127, - "step": 1335 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8163, - "step": 1336 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0639, - "step": 1337 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8929, - "step": 1338 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5011, - "step": 1339 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.033, - "step": 1340 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0526, - "step": 1341 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4428, - "step": 1342 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3477, - "step": 1343 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.881, - "step": 1344 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5276, - "step": 1345 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4183, - "step": 1346 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4943, - "step": 1347 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9187, - "step": 1348 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1003, - "step": 1349 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1187, - "step": 1350 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8081, - "step": 1351 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4695, - "step": 1352 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5761, - "step": 1353 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9635, - "step": 1354 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2133, - "step": 1355 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2611, - "step": 1356 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6885, - "step": 1357 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1157, - "step": 1358 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4421, - "step": 1359 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2128, - "step": 1360 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6978, - "step": 1361 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9804, - "step": 1362 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3426, - "step": 1363 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2676, - "step": 1364 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.325, - "step": 1365 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1263, - "step": 1366 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7481, - "step": 1367 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6891, - "step": 1368 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8568, - "step": 1369 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9893, - "step": 1370 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0302, - "step": 1371 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3204, - "step": 1372 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9008, - "step": 1373 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2624, - "step": 1374 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6234, - "step": 1375 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2286, - "step": 1376 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3426, - "step": 1377 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1962, - "step": 1378 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3142, - "step": 1379 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.414, - "step": 1380 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0191, - "step": 1381 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4953, - "step": 1382 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6694, - "step": 1383 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8611, - "step": 1384 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.86, - "step": 1385 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6519, - "step": 1386 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.394, - "step": 1387 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2117, - "step": 1388 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9924, - "step": 1389 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.58, - "step": 1390 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4415, - "step": 1391 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7196, - "step": 1392 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7388, - "step": 1393 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4784, - "step": 1394 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.496, - "step": 1395 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8706, - "step": 1396 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1858, - "step": 1397 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9038, - "step": 1398 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4852, - "step": 1399 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2717, - "step": 1400 - }, - { - "epoch": 0.01, - "eval_loss": 6.97923469543457, - "eval_runtime": 22.472, - "eval_samples_per_second": 2.225, - "eval_steps_per_second": 1.112, - "step": 1400 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.657382688522339, - "step": 1400 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.843, - "step": 1401 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5611, - "step": 1402 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2434, - "step": 1403 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3136, - "step": 1404 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.686, - "step": 1405 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6365, - "step": 1406 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1811, - "step": 1407 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7537, - "step": 1408 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2949, - "step": 1409 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4827, - "step": 1410 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0965, - "step": 1411 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.007, - "step": 1412 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2861, - "step": 1413 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1774, - "step": 1414 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7777, - "step": 1415 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0259, - "step": 1416 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9024, - "step": 1417 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4786, - "step": 1418 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5873, - "step": 1419 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2744, - "step": 1420 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9484, - "step": 1421 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2093, - "step": 1422 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3394, - "step": 1423 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1247, - "step": 1424 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0691, - "step": 1425 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.559, - "step": 1426 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1518, - "step": 1427 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4143, - "step": 1428 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0287, - "step": 1429 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8112, - "step": 1430 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2625, - "step": 1431 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3528, - "step": 1432 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2715, - "step": 1433 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7849, - "step": 1434 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2002, - "step": 1435 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0658, - "step": 1436 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0671, - "step": 1437 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2577, - "step": 1438 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.803, - "step": 1439 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2974, - "step": 1440 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0897, - "step": 1441 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0805, - "step": 1442 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7681, - "step": 1443 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6565, - "step": 1444 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0174, - "step": 1445 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8507, - "step": 1446 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2105, - "step": 1447 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.572, - "step": 1448 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2904, - "step": 1449 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4623, - "step": 1450 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4774, - "step": 1451 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1277, - "step": 1452 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6204, - "step": 1453 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3219, - "step": 1454 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2517, - "step": 1455 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3026, - "step": 1456 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4016, - "step": 1457 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5256, - "step": 1458 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9316, - "step": 1459 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.631, - "step": 1460 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2888, - "step": 1461 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5511, - "step": 1462 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.9799, - "step": 1463 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6982, - "step": 1464 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4923, - "step": 1465 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8329, - "step": 1466 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2733, - "step": 1467 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8221, - "step": 1468 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.363, - "step": 1469 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6348, - "step": 1470 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3319, - "step": 1471 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6768, - "step": 1472 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1985, - "step": 1473 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6109, - "step": 1474 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.974, - "step": 1475 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8902, - "step": 1476 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6762, - "step": 1477 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8541, - "step": 1478 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3867, - "step": 1479 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9624, - "step": 1480 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8768, - "step": 1481 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7598, - "step": 1482 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6522, - "step": 1483 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8156, - "step": 1484 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3791, - "step": 1485 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2178, - "step": 1486 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8448, - "step": 1487 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5377, - "step": 1488 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7407, - "step": 1489 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7636, - "step": 1490 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4325, - "step": 1491 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8966, - "step": 1492 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0626, - "step": 1493 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.255, - "step": 1494 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2802, - "step": 1495 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.894, - "step": 1496 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6482, - "step": 1497 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8903, - "step": 1498 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8672, - "step": 1499 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6079, - "step": 1500 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6217, - "step": 1501 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2361, - "step": 1502 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3962, - "step": 1503 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0036, - "step": 1504 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5926, - "step": 1505 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.114, - "step": 1506 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4419, - "step": 1507 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7838, - "step": 1508 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6635, - "step": 1509 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2906, - "step": 1510 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4186, - "step": 1511 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4783, - "step": 1512 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1226, - "step": 1513 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2458, - "step": 1514 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5302, - "step": 1515 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1515, - "step": 1516 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4182, - "step": 1517 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8248, - "step": 1518 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2349, - "step": 1519 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9314, - "step": 1520 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1161, - "step": 1521 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4183, - "step": 1522 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4454, - "step": 1523 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5588, - "step": 1524 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8026, - "step": 1525 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7695, - "step": 1526 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3636, - "step": 1527 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2776, - "step": 1528 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5386, - "step": 1529 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.521, - "step": 1530 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8388, - "step": 1531 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3561, - "step": 1532 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9606, - "step": 1533 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9438, - "step": 1534 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7665, - "step": 1535 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5826, - "step": 1536 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.0798, - "step": 1537 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8545, - "step": 1538 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.302, - "step": 1539 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1092, - "step": 1540 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5021, - "step": 1541 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9384, - "step": 1542 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8761, - "step": 1543 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3316, - "step": 1544 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2051, - "step": 1545 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7907, - "step": 1546 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2534, - "step": 1547 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2274, - "step": 1548 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9226, - "step": 1549 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2502, - "step": 1550 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2703, - "step": 1551 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4359, - "step": 1552 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.128, - "step": 1553 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3147, - "step": 1554 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.026, - "step": 1555 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9393, - "step": 1556 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7753, - "step": 1557 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9049, - "step": 1558 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0538, - "step": 1559 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8691, - "step": 1560 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9377, - "step": 1561 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8385, - "step": 1562 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.939, - "step": 1563 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.727, - "step": 1564 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7866, - "step": 1565 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2439, - "step": 1566 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9607, - "step": 1567 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3505, - "step": 1568 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7729, - "step": 1569 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4669, - "step": 1570 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8178, - "step": 1571 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2173, - "step": 1572 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2136, - "step": 1573 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2888, - "step": 1574 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0386, - "step": 1575 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9041, - "step": 1576 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7544, - "step": 1577 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.3229, - "step": 1578 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4203, - "step": 1579 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.497, - "step": 1580 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8253, - "step": 1581 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0801, - "step": 1582 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1585, - "step": 1583 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6965, - "step": 1584 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.498, - "step": 1585 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8697, - "step": 1586 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2663, - "step": 1587 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7004, - "step": 1588 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6561, - "step": 1589 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.785, - "step": 1590 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5708, - "step": 1591 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.326, - "step": 1592 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2974, - "step": 1593 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1408, - "step": 1594 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6526, - "step": 1595 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4116, - "step": 1596 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0484, - "step": 1597 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3162, - "step": 1598 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3806, - "step": 1599 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0251, - "step": 1600 - }, - { - "epoch": 0.01, - "eval_loss": 6.617897987365723, - "eval_runtime": 22.4646, - "eval_samples_per_second": 2.226, - "eval_steps_per_second": 1.113, - "step": 1600 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.160770101547241, - "step": 1600 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9137, - "step": 1601 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2188, - "step": 1602 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7688, - "step": 1603 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9799, - "step": 1604 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5429, - "step": 1605 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8559, - "step": 1606 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3985, - "step": 1607 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9139, - "step": 1608 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3303, - "step": 1609 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5168, - "step": 1610 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5194, - "step": 1611 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9557, - "step": 1612 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7102, - "step": 1613 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8961, - "step": 1614 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6123, - "step": 1615 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7808, - "step": 1616 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4919, - "step": 1617 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0723, - "step": 1618 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2931, - "step": 1619 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8478, - "step": 1620 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7126, - "step": 1621 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6622, - "step": 1622 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3861, - "step": 1623 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9856, - "step": 1624 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5076, - "step": 1625 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4168, - "step": 1626 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2825, - "step": 1627 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7497, - "step": 1628 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5672, - "step": 1629 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4095, - "step": 1630 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.649, - "step": 1631 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3418, - "step": 1632 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1337, - "step": 1633 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3829, - "step": 1634 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0294, - "step": 1635 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2164, - "step": 1636 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3294, - "step": 1637 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7047, - "step": 1638 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5622, - "step": 1639 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4873, - "step": 1640 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6641, - "step": 1641 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3943, - "step": 1642 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2266, - "step": 1643 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0471, - "step": 1644 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5658, - "step": 1645 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6489, - "step": 1646 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3851, - "step": 1647 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7921, - "step": 1648 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4581, - "step": 1649 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1407, - "step": 1650 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2919, - "step": 1651 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4061, - "step": 1652 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3081, - "step": 1653 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0527, - "step": 1654 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8729, - "step": 1655 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.029, - "step": 1656 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6632, - "step": 1657 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7047, - "step": 1658 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6288, - "step": 1659 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8466, - "step": 1660 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7726, - "step": 1661 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.245, - "step": 1662 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0538, - "step": 1663 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3611, - "step": 1664 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.011, - "step": 1665 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6491, - "step": 1666 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3409, - "step": 1667 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.262, - "step": 1668 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.781, - "step": 1669 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8025, - "step": 1670 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7529, - "step": 1671 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2322, - "step": 1672 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4527, - "step": 1673 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9457, - "step": 1674 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.859, - "step": 1675 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9207, - "step": 1676 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5378, - "step": 1677 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6585, - "step": 1678 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9523, - "step": 1679 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1348, - "step": 1680 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9582, - "step": 1681 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.416, - "step": 1682 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8214, - "step": 1683 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8833, - "step": 1684 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1021, - "step": 1685 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7392, - "step": 1686 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2616, - "step": 1687 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.325, - "step": 1688 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3814, - "step": 1689 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2816, - "step": 1690 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.033, - "step": 1691 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5742, - "step": 1692 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0841, - "step": 1693 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2888, - "step": 1694 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9263, - "step": 1695 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7552, - "step": 1696 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4163, - "step": 1697 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6207, - "step": 1698 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.938, - "step": 1699 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2925, - "step": 1700 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0401, - "step": 1701 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1536, - "step": 1702 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2754, - "step": 1703 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6765, - "step": 1704 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.63, - "step": 1705 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6902, - "step": 1706 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6434, - "step": 1707 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2283, - "step": 1708 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9377, - "step": 1709 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.371, - "step": 1710 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6569, - "step": 1711 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2221, - "step": 1712 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5375, - "step": 1713 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2189, - "step": 1714 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.769, - "step": 1715 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0089, - "step": 1716 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6402, - "step": 1717 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4812, - "step": 1718 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9754, - "step": 1719 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8435, - "step": 1720 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9424, - "step": 1721 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5465, - "step": 1722 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.477, - "step": 1723 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2254, - "step": 1724 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3663, - "step": 1725 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.663, - "step": 1726 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6951, - "step": 1727 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.856, - "step": 1728 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0652, - "step": 1729 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6929, - "step": 1730 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8069, - "step": 1731 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.02, - "step": 1732 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0782, - "step": 1733 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0236, - "step": 1734 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2769, - "step": 1735 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7126, - "step": 1736 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2746, - "step": 1737 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8726, - "step": 1738 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7962, - "step": 1739 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7602, - "step": 1740 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.3105, - "step": 1741 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0771, - "step": 1742 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4738, - "step": 1743 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2952, - "step": 1744 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2692, - "step": 1745 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7201, - "step": 1746 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2978, - "step": 1747 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.518, - "step": 1748 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.659, - "step": 1749 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9101, - "step": 1750 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8397, - "step": 1751 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0451, - "step": 1752 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7696, - "step": 1753 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1377, - "step": 1754 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2621, - "step": 1755 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2143, - "step": 1756 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4378, - "step": 1757 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8631, - "step": 1758 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.019, - "step": 1759 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7475, - "step": 1760 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6228, - "step": 1761 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0703, - "step": 1762 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3426, - "step": 1763 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0842, - "step": 1764 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1032, - "step": 1765 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6321, - "step": 1766 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7804, - "step": 1767 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6566, - "step": 1768 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4985, - "step": 1769 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1129, - "step": 1770 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8081, - "step": 1771 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8441, - "step": 1772 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4054, - "step": 1773 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6334, - "step": 1774 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4323, - "step": 1775 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.18, - "step": 1776 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7573, - "step": 1777 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4642, - "step": 1778 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.038, - "step": 1779 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3785, - "step": 1780 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5088, - "step": 1781 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0139, - "step": 1782 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0999, - "step": 1783 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3224, - "step": 1784 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.634, - "step": 1785 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1264, - "step": 1786 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.317, - "step": 1787 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1279, - "step": 1788 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2364, - "step": 1789 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0627, - "step": 1790 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2471, - "step": 1791 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8407, - "step": 1792 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7083, - "step": 1793 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4522, - "step": 1794 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0308, - "step": 1795 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6915, - "step": 1796 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.082, - "step": 1797 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7956, - "step": 1798 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7007, - "step": 1799 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9197, - "step": 1800 - }, - { - "epoch": 0.01, - "eval_loss": 6.619495868682861, - "eval_runtime": 22.4352, - "eval_samples_per_second": 2.229, - "eval_steps_per_second": 1.114, - "step": 1800 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.238778591156006, - "step": 1800 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1537, - "step": 1801 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.684, - "step": 1802 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7862, - "step": 1803 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3518, - "step": 1804 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1795, - "step": 1805 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0054, - "step": 1806 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8997, - "step": 1807 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9002, - "step": 1808 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2805, - "step": 1809 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1203, - "step": 1810 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0206, - "step": 1811 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0151, - "step": 1812 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3864, - "step": 1813 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1117, - "step": 1814 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8487, - "step": 1815 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.59, - "step": 1816 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1615, - "step": 1817 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7362, - "step": 1818 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2294, - "step": 1819 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5622, - "step": 1820 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5437, - "step": 1821 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.093, - "step": 1822 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0343, - "step": 1823 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4454, - "step": 1824 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5138, - "step": 1825 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5605, - "step": 1826 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.322, - "step": 1827 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6489, - "step": 1828 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.331, - "step": 1829 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6462, - "step": 1830 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.932, - "step": 1831 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9058, - "step": 1832 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3433, - "step": 1833 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4365, - "step": 1834 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3282, - "step": 1835 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.448, - "step": 1836 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5369, - "step": 1837 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.177, - "step": 1838 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3552, - "step": 1839 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4568, - "step": 1840 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0602, - "step": 1841 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7449, - "step": 1842 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2675, - "step": 1843 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0317, - "step": 1844 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4342, - "step": 1845 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8688, - "step": 1846 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3571, - "step": 1847 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3776, - "step": 1848 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2248, - "step": 1849 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6073, - "step": 1850 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8425, - "step": 1851 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5954, - "step": 1852 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4197, - "step": 1853 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8624, - "step": 1854 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9652, - "step": 1855 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7145, - "step": 1856 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5309, - "step": 1857 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4356, - "step": 1858 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6508, - "step": 1859 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0955, - "step": 1860 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6886, - "step": 1861 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7644, - "step": 1862 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5709, - "step": 1863 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6212, - "step": 1864 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6325, - "step": 1865 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6805, - "step": 1866 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1464, - "step": 1867 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9244, - "step": 1868 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.336, - "step": 1869 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8783, - "step": 1870 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8236, - "step": 1871 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.084, - "step": 1872 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9639, - "step": 1873 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4173, - "step": 1874 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0042, - "step": 1875 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2519, - "step": 1876 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4656, - "step": 1877 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5136, - "step": 1878 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3918, - "step": 1879 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9696, - "step": 1880 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9736, - "step": 1881 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6192, - "step": 1882 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3476, - "step": 1883 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3048, - "step": 1884 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1116, - "step": 1885 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.971, - "step": 1886 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0741, - "step": 1887 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1418, - "step": 1888 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3487, - "step": 1889 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.38, - "step": 1890 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6561, - "step": 1891 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5606, - "step": 1892 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8623, - "step": 1893 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2984, - "step": 1894 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6179, - "step": 1895 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8625, - "step": 1896 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8596, - "step": 1897 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7205, - "step": 1898 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6727, - "step": 1899 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.016, - "step": 1900 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9868, - "step": 1901 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.632, - "step": 1902 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5133, - "step": 1903 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7476, - "step": 1904 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4174, - "step": 1905 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6789, - "step": 1906 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4534, - "step": 1907 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3335, - "step": 1908 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7921, - "step": 1909 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9567, - "step": 1910 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.1739, - "step": 1911 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7514, - "step": 1912 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3858, - "step": 1913 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0462, - "step": 1914 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3817, - "step": 1915 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9739, - "step": 1916 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1122, - "step": 1917 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3361, - "step": 1918 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3184, - "step": 1919 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7342, - "step": 1920 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.375, - "step": 1921 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6841, - "step": 1922 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0773, - "step": 1923 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8916, - "step": 1924 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7176, - "step": 1925 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8841, - "step": 1926 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8345, - "step": 1927 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.561, - "step": 1928 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5392, - "step": 1929 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1627, - "step": 1930 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0657, - "step": 1931 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7385, - "step": 1932 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5533, - "step": 1933 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0925, - "step": 1934 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8752, - "step": 1935 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4039, - "step": 1936 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6472, - "step": 1937 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1819, - "step": 1938 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5919, - "step": 1939 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6527, - "step": 1940 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5188, - "step": 1941 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9856, - "step": 1942 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7038, - "step": 1943 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.911, - "step": 1944 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.497, - "step": 1945 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1804, - "step": 1946 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3949, - "step": 1947 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0433, - "step": 1948 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4706, - "step": 1949 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5896, - "step": 1950 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.557, - "step": 1951 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.34, - "step": 1952 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7865, - "step": 1953 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0797, - "step": 1954 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2896, - "step": 1955 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4096, - "step": 1956 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9538, - "step": 1957 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2778, - "step": 1958 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4968, - "step": 1959 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8328, - "step": 1960 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4597, - "step": 1961 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6776, - "step": 1962 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4861, - "step": 1963 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5831, - "step": 1964 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4585, - "step": 1965 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7898, - "step": 1966 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8714, - "step": 1967 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.752, - "step": 1968 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9024, - "step": 1969 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.058, - "step": 1970 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1745, - "step": 1971 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2162, - "step": 1972 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2668, - "step": 1973 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3307, - "step": 1974 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3285, - "step": 1975 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1353, - "step": 1976 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8069, - "step": 1977 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6885, - "step": 1978 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5946, - "step": 1979 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6828, - "step": 1980 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6516, - "step": 1981 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.261, - "step": 1982 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.524, - "step": 1983 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.585, - "step": 1984 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8883, - "step": 1985 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.689, - "step": 1986 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1083, - "step": 1987 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1606, - "step": 1988 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9243, - "step": 1989 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6597, - "step": 1990 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2849, - "step": 1991 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3715, - "step": 1992 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7262, - "step": 1993 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6862, - "step": 1994 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5412, - "step": 1995 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7483, - "step": 1996 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3391, - "step": 1997 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2642, - "step": 1998 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1519, - "step": 1999 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7098, - "step": 2000 - }, - { - "epoch": 0.02, - "eval_loss": 6.762476921081543, - "eval_runtime": 22.4899, - "eval_samples_per_second": 2.223, - "eval_steps_per_second": 1.112, - "step": 2000 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.4606559085845947, - "step": 2000 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8099, - "step": 2001 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0567, - "step": 2002 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2981, - "step": 2003 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2668, - "step": 2004 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.139, - "step": 2005 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.903, - "step": 2006 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2182, - "step": 2007 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2347, - "step": 2008 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8383, - "step": 2009 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0211, - "step": 2010 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2572, - "step": 2011 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2877, - "step": 2012 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3577, - "step": 2013 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2022, - "step": 2014 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2722, - "step": 2015 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0552, - "step": 2016 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.9857, - "step": 2017 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0519, - "step": 2018 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7118, - "step": 2019 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4465, - "step": 2020 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3009, - "step": 2021 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3614, - "step": 2022 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3493, - "step": 2023 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.34, - "step": 2024 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0416, - "step": 2025 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.686, - "step": 2026 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6021, - "step": 2027 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4161, - "step": 2028 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.0029, - "step": 2029 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.8579, - "step": 2030 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0247, - "step": 2031 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4184, - "step": 2032 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4962, - "step": 2033 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5137, - "step": 2034 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.6692, - "step": 2035 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7161, - "step": 2036 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.617, - "step": 2037 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.413, - "step": 2038 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3056, - "step": 2039 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9441, - "step": 2040 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9943, - "step": 2041 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5703, - "step": 2042 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1881, - "step": 2043 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5763, - "step": 2044 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6389, - "step": 2045 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1717, - "step": 2046 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5482, - "step": 2047 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9469, - "step": 2048 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7685, - "step": 2049 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1381, - "step": 2050 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6961, - "step": 2051 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6425, - "step": 2052 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5354, - "step": 2053 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2404, - "step": 2054 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1556, - "step": 2055 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7133, - "step": 2056 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8166, - "step": 2057 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5071, - "step": 2058 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5429, - "step": 2059 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0367, - "step": 2060 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5386, - "step": 2061 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.5899, - "step": 2062 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2968, - "step": 2063 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9951, - "step": 2064 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8608, - "step": 2065 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4735, - "step": 2066 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5612, - "step": 2067 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7461, - "step": 2068 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5887, - "step": 2069 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3426, - "step": 2070 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5589, - "step": 2071 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.498, - "step": 2072 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1306, - "step": 2073 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.3492, - "step": 2074 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2311, - "step": 2075 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8798, - "step": 2076 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6799, - "step": 2077 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5011, - "step": 2078 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8892, - "step": 2079 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6449, - "step": 2080 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9117, - "step": 2081 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1157, - "step": 2082 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.196, - "step": 2083 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.9364, - "step": 2084 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3618, - "step": 2085 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3755, - "step": 2086 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4564, - "step": 2087 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4912, - "step": 2088 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.113, - "step": 2089 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0588, - "step": 2090 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.668, - "step": 2091 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.08, - "step": 2092 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2042, - "step": 2093 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4134, - "step": 2094 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0456, - "step": 2095 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2245, - "step": 2096 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4936, - "step": 2097 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5158, - "step": 2098 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7269, - "step": 2099 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7077, - "step": 2100 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6008, - "step": 2101 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4652, - "step": 2102 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.918, - "step": 2103 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.5819, - "step": 2104 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7764, - "step": 2105 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.0525, - "step": 2106 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5359, - "step": 2107 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4925, - "step": 2108 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4857, - "step": 2109 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.9445, - "step": 2110 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8494, - "step": 2111 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1513, - "step": 2112 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2552, - "step": 2113 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7229, - "step": 2114 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8571, - "step": 2115 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5968, - "step": 2116 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8806, - "step": 2117 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4641, - "step": 2118 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6039, - "step": 2119 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1379, - "step": 2120 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6688, - "step": 2121 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.293, - "step": 2122 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5664, - "step": 2123 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0825, - "step": 2124 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9788, - "step": 2125 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.9641, - "step": 2126 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7799, - "step": 2127 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0619, - "step": 2128 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0022, - "step": 2129 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8022, - "step": 2130 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5301, - "step": 2131 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.681, - "step": 2132 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7362, - "step": 2133 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5462, - "step": 2134 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2356, - "step": 2135 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2259, - "step": 2136 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3646, - "step": 2137 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8765, - "step": 2138 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6487, - "step": 2139 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9622, - "step": 2140 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1761, - "step": 2141 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6922, - "step": 2142 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0371, - "step": 2143 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7869, - "step": 2144 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3725, - "step": 2145 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8894, - "step": 2146 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6083, - "step": 2147 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4451, - "step": 2148 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1149, - "step": 2149 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8058, - "step": 2150 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1308, - "step": 2151 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1447, - "step": 2152 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.208, - "step": 2153 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5193, - "step": 2154 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7729, - "step": 2155 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5019, - "step": 2156 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6092, - "step": 2157 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1853, - "step": 2158 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7, - "step": 2159 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1638, - "step": 2160 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.762, - "step": 2161 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7455, - "step": 2162 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9372, - "step": 2163 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4569, - "step": 2164 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6705, - "step": 2165 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1988, - "step": 2166 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2526, - "step": 2167 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9066, - "step": 2168 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1365, - "step": 2169 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3422, - "step": 2170 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2691, - "step": 2171 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9008, - "step": 2172 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2555, - "step": 2173 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0886, - "step": 2174 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0369, - "step": 2175 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5566, - "step": 2176 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2567, - "step": 2177 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0179, - "step": 2178 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5383, - "step": 2179 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4797, - "step": 2180 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0163, - "step": 2181 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2658, - "step": 2182 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1337, - "step": 2183 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3287, - "step": 2184 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7874, - "step": 2185 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7153, - "step": 2186 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7037, - "step": 2187 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4412, - "step": 2188 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3912, - "step": 2189 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.034, - "step": 2190 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4697, - "step": 2191 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6243, - "step": 2192 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1133, - "step": 2193 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.9005, - "step": 2194 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7386, - "step": 2195 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4169, - "step": 2196 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8139, - "step": 2197 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3012, - "step": 2198 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8223, - "step": 2199 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3757, - "step": 2200 - }, - { - "epoch": 0.02, - "eval_loss": 6.580160140991211, - "eval_runtime": 22.4971, - "eval_samples_per_second": 2.223, - "eval_steps_per_second": 1.111, - "step": 2200 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.755114164352417, - "step": 2200 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5282, - "step": 2201 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2478, - "step": 2202 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.916, - "step": 2203 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5069, - "step": 2204 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5952, - "step": 2205 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5059, - "step": 2206 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7434, - "step": 2207 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.625, - "step": 2208 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1674, - "step": 2209 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3937, - "step": 2210 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8783, - "step": 2211 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5263, - "step": 2212 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7887, - "step": 2213 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8911, - "step": 2214 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7211, - "step": 2215 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.089, - "step": 2216 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6373, - "step": 2217 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7728, - "step": 2218 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6957, - "step": 2219 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.43, - "step": 2220 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9673, - "step": 2221 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8942, - "step": 2222 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2893, - "step": 2223 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1505, - "step": 2224 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3702, - "step": 2225 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1731, - "step": 2226 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.997, - "step": 2227 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9531, - "step": 2228 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0748, - "step": 2229 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0642, - "step": 2230 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9469, - "step": 2231 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2265, - "step": 2232 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6461, - "step": 2233 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.064, - "step": 2234 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1414, - "step": 2235 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5375, - "step": 2236 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6348, - "step": 2237 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9975, - "step": 2238 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5242, - "step": 2239 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3179, - "step": 2240 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6054, - "step": 2241 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1832, - "step": 2242 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.0572, - "step": 2243 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2049, - "step": 2244 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6348, - "step": 2245 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.67, - "step": 2246 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.5627, - "step": 2247 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1851, - "step": 2248 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6792, - "step": 2249 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6344, - "step": 2250 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7603, - "step": 2251 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7737, - "step": 2252 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5323, - "step": 2253 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4059, - "step": 2254 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9343, - "step": 2255 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0156, - "step": 2256 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1851, - "step": 2257 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.44, - "step": 2258 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9079, - "step": 2259 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4982, - "step": 2260 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 12.3777, - "step": 2261 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.1265, - "step": 2262 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1428, - "step": 2263 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8072, - "step": 2264 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.911, - "step": 2265 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9453, - "step": 2266 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0168, - "step": 2267 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2098, - "step": 2268 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4417, - "step": 2269 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8449, - "step": 2270 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.394, - "step": 2271 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7642, - "step": 2272 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5555, - "step": 2273 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3576, - "step": 2274 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.386, - "step": 2275 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6677, - "step": 2276 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2385, - "step": 2277 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8074, - "step": 2278 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2963, - "step": 2279 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3612, - "step": 2280 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1837, - "step": 2281 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5882, - "step": 2282 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0968, - "step": 2283 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2376, - "step": 2284 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3835, - "step": 2285 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0143, - "step": 2286 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.36, - "step": 2287 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0121, - "step": 2288 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0144, - "step": 2289 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6807, - "step": 2290 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8854, - "step": 2291 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1727, - "step": 2292 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.533, - "step": 2293 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9793, - "step": 2294 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.538, - "step": 2295 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.145, - "step": 2296 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.163, - "step": 2297 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1045, - "step": 2298 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0209, - "step": 2299 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9728, - "step": 2300 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8902, - "step": 2301 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3075, - "step": 2302 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.2194, - "step": 2303 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7375, - "step": 2304 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3863, - "step": 2305 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1317, - "step": 2306 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1878, - "step": 2307 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6124, - "step": 2308 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8843, - "step": 2309 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3988, - "step": 2310 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3523, - "step": 2311 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5766, - "step": 2312 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9096, - "step": 2313 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9315, - "step": 2314 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4044, - "step": 2315 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6206, - "step": 2316 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2429, - "step": 2317 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0383, - "step": 2318 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4282, - "step": 2319 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8973, - "step": 2320 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1771, - "step": 2321 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.624, - "step": 2322 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5197, - "step": 2323 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7313, - "step": 2324 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8234, - "step": 2325 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1702, - "step": 2326 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.536, - "step": 2327 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1904, - "step": 2328 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2077, - "step": 2329 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.891, - "step": 2330 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6784, - "step": 2331 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6611, - "step": 2332 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3402, - "step": 2333 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 11.1523, - "step": 2334 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5547, - "step": 2335 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3485, - "step": 2336 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8289, - "step": 2337 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2558, - "step": 2338 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1794, - "step": 2339 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8782, - "step": 2340 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.415, - "step": 2341 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5257, - "step": 2342 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4751, - "step": 2343 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2259, - "step": 2344 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8681, - "step": 2345 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6307, - "step": 2346 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1487, - "step": 2347 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.3949, - "step": 2348 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6988, - "step": 2349 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1299, - "step": 2350 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9938, - "step": 2351 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4176, - "step": 2352 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0184, - "step": 2353 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2779, - "step": 2354 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0162, - "step": 2355 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2335, - "step": 2356 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5505, - "step": 2357 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6445, - "step": 2358 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6513, - "step": 2359 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8503, - "step": 2360 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1817, - "step": 2361 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4376, - "step": 2362 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1351, - "step": 2363 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7566, - "step": 2364 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.626, - "step": 2365 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5818, - "step": 2366 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3033, - "step": 2367 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9289, - "step": 2368 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0301, - "step": 2369 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4713, - "step": 2370 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0931, - "step": 2371 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5812, - "step": 2372 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2272, - "step": 2373 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5174, - "step": 2374 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1849, - "step": 2375 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7496, - "step": 2376 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.776, - "step": 2377 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3555, - "step": 2378 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.688, - "step": 2379 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0143, - "step": 2380 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7788, - "step": 2381 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7772, - "step": 2382 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6875, - "step": 2383 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9944, - "step": 2384 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8363, - "step": 2385 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7276, - "step": 2386 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4892, - "step": 2387 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1083, - "step": 2388 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.834, - "step": 2389 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8406, - "step": 2390 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1168, - "step": 2391 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2535, - "step": 2392 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9025, - "step": 2393 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4481, - "step": 2394 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7631, - "step": 2395 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2051, - "step": 2396 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7816, - "step": 2397 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2566, - "step": 2398 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1125, - "step": 2399 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5952, - "step": 2400 - }, - { - "epoch": 0.02, - "eval_loss": 6.616010665893555, - "eval_runtime": 22.4801, - "eval_samples_per_second": 2.224, - "eval_steps_per_second": 1.112, - "step": 2400 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.427501640319824, - "step": 2400 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6081, - "step": 2401 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2359, - "step": 2402 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2628, - "step": 2403 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8465, - "step": 2404 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.6746, - "step": 2405 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1066, - "step": 2406 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4981, - "step": 2407 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9194, - "step": 2408 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.239, - "step": 2409 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1749, - "step": 2410 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4597, - "step": 2411 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5164, - "step": 2412 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4122, - "step": 2413 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7398, - "step": 2414 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5009, - "step": 2415 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2757, - "step": 2416 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4202, - "step": 2417 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.108, - "step": 2418 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3303, - "step": 2419 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4671, - "step": 2420 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5768, - "step": 2421 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9172, - "step": 2422 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7606, - "step": 2423 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0745, - "step": 2424 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2907, - "step": 2425 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6956, - "step": 2426 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4414, - "step": 2427 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9823, - "step": 2428 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6486, - "step": 2429 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5173, - "step": 2430 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4647, - "step": 2431 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9405, - "step": 2432 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4226, - "step": 2433 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4334, - "step": 2434 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9136, - "step": 2435 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.6707, - "step": 2436 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6107, - "step": 2437 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5296, - "step": 2438 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0736, - "step": 2439 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4398, - "step": 2440 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5669, - "step": 2441 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.303, - "step": 2442 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2993, - "step": 2443 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9063, - "step": 2444 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3015, - "step": 2445 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3117, - "step": 2446 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6228, - "step": 2447 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6216, - "step": 2448 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6188, - "step": 2449 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8996, - "step": 2450 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5802, - "step": 2451 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2603, - "step": 2452 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0921, - "step": 2453 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9377, - "step": 2454 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0934, - "step": 2455 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9832, - "step": 2456 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1084, - "step": 2457 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2592, - "step": 2458 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8545, - "step": 2459 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4335, - "step": 2460 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5591, - "step": 2461 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.284, - "step": 2462 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8171, - "step": 2463 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8541, - "step": 2464 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1355, - "step": 2465 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6885, - "step": 2466 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.311, - "step": 2467 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.72, - "step": 2468 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.007, - "step": 2469 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2275, - "step": 2470 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.129, - "step": 2471 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9531, - "step": 2472 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7327, - "step": 2473 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5001, - "step": 2474 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9443, - "step": 2475 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6005, - "step": 2476 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5551, - "step": 2477 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3044, - "step": 2478 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6257, - "step": 2479 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5029, - "step": 2480 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3717, - "step": 2481 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5226, - "step": 2482 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2921, - "step": 2483 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7169, - "step": 2484 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2185, - "step": 2485 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5393, - "step": 2486 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0286, - "step": 2487 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3738, - "step": 2488 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2249, - "step": 2489 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7828, - "step": 2490 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.87, - "step": 2491 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.681, - "step": 2492 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5446, - "step": 2493 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0769, - "step": 2494 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3587, - "step": 2495 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9188, - "step": 2496 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9357, - "step": 2497 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3449, - "step": 2498 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2753, - "step": 2499 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3077, - "step": 2500 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0085, - "step": 2501 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5436, - "step": 2502 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9096, - "step": 2503 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7288, - "step": 2504 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7344, - "step": 2505 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6385, - "step": 2506 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6713, - "step": 2507 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6065, - "step": 2508 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3456, - "step": 2509 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1633, - "step": 2510 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5557, - "step": 2511 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7075, - "step": 2512 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4954, - "step": 2513 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5104, - "step": 2514 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5706, - "step": 2515 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7479, - "step": 2516 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7042, - "step": 2517 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9569, - "step": 2518 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7846, - "step": 2519 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.749, - "step": 2520 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5868, - "step": 2521 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3957, - "step": 2522 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2594, - "step": 2523 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.632, - "step": 2524 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.023, - "step": 2525 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0423, - "step": 2526 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1901, - "step": 2527 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0025, - "step": 2528 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0252, - "step": 2529 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8165, - "step": 2530 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6864, - "step": 2531 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.1174, - "step": 2532 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.46, - "step": 2533 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3714, - "step": 2534 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1206, - "step": 2535 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3734, - "step": 2536 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7229, - "step": 2537 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0124, - "step": 2538 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2105, - "step": 2539 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1127, - "step": 2540 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.1163, - "step": 2541 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5255, - "step": 2542 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2058, - "step": 2543 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7425, - "step": 2544 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3621, - "step": 2545 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7541, - "step": 2546 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9611, - "step": 2547 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3031, - "step": 2548 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1756, - "step": 2549 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6146, - "step": 2550 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1434, - "step": 2551 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0786, - "step": 2552 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9498, - "step": 2553 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8681, - "step": 2554 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5898, - "step": 2555 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7522, - "step": 2556 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3648, - "step": 2557 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8895, - "step": 2558 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9903, - "step": 2559 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1113, - "step": 2560 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6541, - "step": 2561 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8563, - "step": 2562 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.0685, - "step": 2563 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.59, - "step": 2564 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0927, - "step": 2565 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3792, - "step": 2566 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.208, - "step": 2567 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9275, - "step": 2568 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.25, - "step": 2569 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9524, - "step": 2570 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.556, - "step": 2571 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6414, - "step": 2572 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1749, - "step": 2573 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4529, - "step": 2574 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9764, - "step": 2575 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1835, - "step": 2576 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.438, - "step": 2577 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.834, - "step": 2578 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8879, - "step": 2579 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1302, - "step": 2580 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8966, - "step": 2581 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7319, - "step": 2582 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3287, - "step": 2583 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3322, - "step": 2584 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0278, - "step": 2585 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5355, - "step": 2586 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2016, - "step": 2587 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8335, - "step": 2588 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.721, - "step": 2589 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4628, - "step": 2590 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7896, - "step": 2591 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7226, - "step": 2592 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5834, - "step": 2593 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8944, - "step": 2594 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1519, - "step": 2595 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2173, - "step": 2596 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9751, - "step": 2597 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1408, - "step": 2598 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2469, - "step": 2599 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3136, - "step": 2600 - }, - { - "epoch": 0.02, - "eval_loss": 6.580307483673096, - "eval_runtime": 22.5866, - "eval_samples_per_second": 2.214, - "eval_steps_per_second": 1.107, - "step": 2600 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.17715097402597402, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.21428571428571427, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.0, - "mmlu_loss": 3.684196367263794, - "step": 2600 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4702, - "step": 2601 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2103, - "step": 2602 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1688, - "step": 2603 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0882, - "step": 2604 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4417, - "step": 2605 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4982, - "step": 2606 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.3721, - "step": 2607 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5558, - "step": 2608 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.251, - "step": 2609 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5517, - "step": 2610 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5841, - "step": 2611 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3849, - "step": 2612 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5556, - "step": 2613 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4158, - "step": 2614 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9386, - "step": 2615 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6376, - "step": 2616 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7976, - "step": 2617 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.291, - "step": 2618 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8779, - "step": 2619 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8159, - "step": 2620 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1198, - "step": 2621 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9783, - "step": 2622 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0635, - "step": 2623 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8539, - "step": 2624 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5904, - "step": 2625 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7561, - "step": 2626 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3628, - "step": 2627 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.2452, - "step": 2628 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8438, - "step": 2629 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7726, - "step": 2630 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.8356, - "step": 2631 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6763, - "step": 2632 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9104, - "step": 2633 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1199, - "step": 2634 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4833, - "step": 2635 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6164, - "step": 2636 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2284, - "step": 2637 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8304, - "step": 2638 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7706, - "step": 2639 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.638, - "step": 2640 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9576, - "step": 2641 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0828, - "step": 2642 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5206, - "step": 2643 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7744, - "step": 2644 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5361, - "step": 2645 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9041, - "step": 2646 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6145, - "step": 2647 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9121, - "step": 2648 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1322, - "step": 2649 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.1881, - "step": 2650 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6446, - "step": 2651 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9137, - "step": 2652 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4681, - "step": 2653 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9891, - "step": 2654 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3255, - "step": 2655 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.3909, - "step": 2656 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6869, - "step": 2657 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0611, - "step": 2658 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3314, - "step": 2659 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6415, - "step": 2660 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5881, - "step": 2661 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8889, - "step": 2662 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3336, - "step": 2663 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1282, - "step": 2664 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.158, - "step": 2665 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1773, - "step": 2666 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9617, - "step": 2667 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5576, - "step": 2668 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8284, - "step": 2669 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5936, - "step": 2670 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0931, - "step": 2671 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.07, - "step": 2672 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.778, - "step": 2673 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7786, - "step": 2674 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1279, - "step": 2675 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.463, - "step": 2676 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2192, - "step": 2677 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4147, - "step": 2678 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9957, - "step": 2679 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8919, - "step": 2680 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1558, - "step": 2681 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7069, - "step": 2682 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.487, - "step": 2683 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7098, - "step": 2684 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1032, - "step": 2685 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9937, - "step": 2686 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.0677, - "step": 2687 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.644, - "step": 2688 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5099, - "step": 2689 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6922, - "step": 2690 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7491, - "step": 2691 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.605, - "step": 2692 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1904, - "step": 2693 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9175, - "step": 2694 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3251, - "step": 2695 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.315, - "step": 2696 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3052, - "step": 2697 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2122, - "step": 2698 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9462, - "step": 2699 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3221, - "step": 2700 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3125, - "step": 2701 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.938, - "step": 2702 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0983, - "step": 2703 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8028, - "step": 2704 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4752, - "step": 2705 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.16, - "step": 2706 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2429, - "step": 2707 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.623, - "step": 2708 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9595, - "step": 2709 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5444, - "step": 2710 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.6245, - "step": 2711 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.919, - "step": 2712 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7332, - "step": 2713 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0067, - "step": 2714 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6957, - "step": 2715 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.994, - "step": 2716 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7562, - "step": 2717 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6882, - "step": 2718 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8732, - "step": 2719 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6496, - "step": 2720 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4841, - "step": 2721 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4927, - "step": 2722 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7437, - "step": 2723 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9469, - "step": 2724 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1481, - "step": 2725 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7762, - "step": 2726 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8806, - "step": 2727 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8352, - "step": 2728 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9488, - "step": 2729 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1679, - "step": 2730 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2412, - "step": 2731 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6839, - "step": 2732 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7269, - "step": 2733 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6139, - "step": 2734 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8754, - "step": 2735 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9007, - "step": 2736 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9134, - "step": 2737 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9469, - "step": 2738 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9293, - "step": 2739 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0489, - "step": 2740 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4197, - "step": 2741 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.3667, - "step": 2742 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8059, - "step": 2743 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.024, - "step": 2744 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0756, - "step": 2745 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0596, - "step": 2746 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1416, - "step": 2747 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1271, - "step": 2748 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1452, - "step": 2749 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.9527, - "step": 2750 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9189, - "step": 2751 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4364, - "step": 2752 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4173, - "step": 2753 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4034, - "step": 2754 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6678, - "step": 2755 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1092, - "step": 2756 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7751, - "step": 2757 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0164, - "step": 2758 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5796, - "step": 2759 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7851, - "step": 2760 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1784, - "step": 2761 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7417, - "step": 2762 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4268, - "step": 2763 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6919, - "step": 2764 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1838, - "step": 2765 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5592, - "step": 2766 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.425, - "step": 2767 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.848, - "step": 2768 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5683, - "step": 2769 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0262, - "step": 2770 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8063, - "step": 2771 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6139, - "step": 2772 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3478, - "step": 2773 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1811, - "step": 2774 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4519, - "step": 2775 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0071, - "step": 2776 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7872, - "step": 2777 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2263, - "step": 2778 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8923, - "step": 2779 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2997, - "step": 2780 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6857, - "step": 2781 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8874, - "step": 2782 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8203, - "step": 2783 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9583, - "step": 2784 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0814, - "step": 2785 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.419, - "step": 2786 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3063, - "step": 2787 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1285, - "step": 2788 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0039, - "step": 2789 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.671, - "step": 2790 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5452, - "step": 2791 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3885, - "step": 2792 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6282, - "step": 2793 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5913, - "step": 2794 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6189, - "step": 2795 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2968, - "step": 2796 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2825, - "step": 2797 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9897, - "step": 2798 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8193, - "step": 2799 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7042, - "step": 2800 - }, - { - "epoch": 0.02, - "eval_loss": 6.604581832885742, - "eval_runtime": 22.516, - "eval_samples_per_second": 2.221, - "eval_steps_per_second": 1.11, - "step": 2800 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 4.006761569976806, - "step": 2800 - }, - { - "epoch": 0.02, - "step": 2800, - "total_flos": 4.660001608148582e+16, - "train_loss": 6.312225336258395, - "train_runtime": 7855.0688, - "train_samples_per_second": 3.819, - "train_steps_per_second": 3.819 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0757, - "step": 2801 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8944, - "step": 2802 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8365, - "step": 2803 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.873, - "step": 2804 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3793, - "step": 2805 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1923, - "step": 2806 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2306, - "step": 2807 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4439, - "step": 2808 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3377, - "step": 2809 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8737, - "step": 2810 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4191, - "step": 2811 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.892, - "step": 2812 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4568, - "step": 2813 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0652, - "step": 2814 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6912, - "step": 2815 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9033, - "step": 2816 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4433, - "step": 2817 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7591, - "step": 2818 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4458, - "step": 2819 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3721, - "step": 2820 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4111, - "step": 2821 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0999, - "step": 2822 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5154, - "step": 2823 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1967, - "step": 2824 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8437, - "step": 2825 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.759, - "step": 2826 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6223, - "step": 2827 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3442, - "step": 2828 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1991, - "step": 2829 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5302, - "step": 2830 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1725, - "step": 2831 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8107, - "step": 2832 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7891, - "step": 2833 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5592, - "step": 2834 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8792, - "step": 2835 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2387, - "step": 2836 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9369, - "step": 2837 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2098, - "step": 2838 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6645, - "step": 2839 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2349, - "step": 2840 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8613, - "step": 2841 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5482, - "step": 2842 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5438, - "step": 2843 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6133, - "step": 2844 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9983, - "step": 2845 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8706, - "step": 2846 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9685, - "step": 2847 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.04, - "step": 2848 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6498, - "step": 2849 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6786, - "step": 2850 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.789, - "step": 2851 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.1116, - "step": 2852 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7085, - "step": 2853 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1083, - "step": 2854 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0795, - "step": 2855 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8677, - "step": 2856 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1663, - "step": 2857 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5379, - "step": 2858 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4923, - "step": 2859 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1456, - "step": 2860 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1686, - "step": 2861 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4153, - "step": 2862 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.17, - "step": 2863 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3301, - "step": 2864 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7106, - "step": 2865 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.463, - "step": 2866 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.614, - "step": 2867 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1451, - "step": 2868 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6817, - "step": 2869 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9577, - "step": 2870 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6171, - "step": 2871 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5129, - "step": 2872 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3386, - "step": 2873 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1176, - "step": 2874 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9075, - "step": 2875 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.667, - "step": 2876 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8097, - "step": 2877 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7707, - "step": 2878 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7571, - "step": 2879 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0732, - "step": 2880 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5586, - "step": 2881 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8045, - "step": 2882 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4442, - "step": 2883 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.418, - "step": 2884 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7253, - "step": 2885 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4227, - "step": 2886 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9878, - "step": 2887 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8351, - "step": 2888 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1715, - "step": 2889 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1207, - "step": 2890 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0396, - "step": 2891 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7162, - "step": 2892 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2566, - "step": 2893 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4087, - "step": 2894 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4509, - "step": 2895 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8165, - "step": 2896 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9842, - "step": 2897 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.574, - "step": 2898 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4512, - "step": 2899 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9904, - "step": 2900 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6141, - "step": 2901 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9528, - "step": 2902 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.9028, - "step": 2903 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3868, - "step": 2904 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0461, - "step": 2905 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5161, - "step": 2906 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.107, - "step": 2907 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7174, - "step": 2908 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7671, - "step": 2909 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6803, - "step": 2910 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5357, - "step": 2911 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6898, - "step": 2912 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8564, - "step": 2913 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1457, - "step": 2914 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3408, - "step": 2915 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6095, - "step": 2916 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.866, - "step": 2917 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7324, - "step": 2918 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4026, - "step": 2919 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1467, - "step": 2920 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2496, - "step": 2921 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5265, - "step": 2922 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8246, - "step": 2923 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5775, - "step": 2924 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2886, - "step": 2925 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3076, - "step": 2926 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7975, - "step": 2927 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9898, - "step": 2928 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7256, - "step": 2929 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7971, - "step": 2930 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5884, - "step": 2931 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0749, - "step": 2932 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6183, - "step": 2933 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0019, - "step": 2934 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1706, - "step": 2935 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4075, - "step": 2936 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4092, - "step": 2937 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9424, - "step": 2938 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9025, - "step": 2939 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7751, - "step": 2940 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.988, - "step": 2941 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1208, - "step": 2942 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1999, - "step": 2943 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2129, - "step": 2944 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4431, - "step": 2945 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1979, - "step": 2946 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8246, - "step": 2947 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4876, - "step": 2948 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7158, - "step": 2949 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3611, - "step": 2950 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.9919, - "step": 2951 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4136, - "step": 2952 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.801, - "step": 2953 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6503, - "step": 2954 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.553, - "step": 2955 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3536, - "step": 2956 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8516, - "step": 2957 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.9344, - "step": 2958 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8727, - "step": 2959 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9155, - "step": 2960 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9684, - "step": 2961 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0399, - "step": 2962 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4298, - "step": 2963 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4559, - "step": 2964 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0361, - "step": 2965 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0081, - "step": 2966 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6698, - "step": 2967 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3355, - "step": 2968 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7555, - "step": 2969 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.29, - "step": 2970 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4079, - "step": 2971 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0588, - "step": 2972 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2956, - "step": 2973 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7318, - "step": 2974 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8846, - "step": 2975 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5159, - "step": 2976 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7629, - "step": 2977 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2039, - "step": 2978 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.311, - "step": 2979 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9786, - "step": 2980 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7138, - "step": 2981 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4827, - "step": 2982 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5264, - "step": 2983 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8153, - "step": 2984 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3512, - "step": 2985 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1515, - "step": 2986 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1689, - "step": 2987 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8027, - "step": 2988 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7472, - "step": 2989 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0236, - "step": 2990 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1741, - "step": 2991 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8884, - "step": 2992 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3912, - "step": 2993 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2109, - "step": 2994 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1693, - "step": 2995 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8166, - "step": 2996 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4902, - "step": 2997 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3921, - "step": 2998 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8787, - "step": 2999 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1304, - "step": 3000 - }, - { - "epoch": 0.02, - "eval_loss": 6.659167289733887, - "eval_runtime": 22.4512, - "eval_samples_per_second": 2.227, - "eval_steps_per_second": 1.114, - "step": 3000 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.973116703033447, - "step": 3000 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4366, - "step": 3001 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1239, - "step": 3002 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.419, - "step": 3003 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7662, - "step": 3004 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1246, - "step": 3005 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3498, - "step": 3006 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1245, - "step": 3007 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6488, - "step": 3008 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3641, - "step": 3009 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7842, - "step": 3010 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.802, - "step": 3011 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1015, - "step": 3012 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9015, - "step": 3013 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8768, - "step": 3014 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7296, - "step": 3015 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4175, - "step": 3016 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3568, - "step": 3017 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5869, - "step": 3018 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5304, - "step": 3019 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1129, - "step": 3020 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8128, - "step": 3021 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1541, - "step": 3022 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3002, - "step": 3023 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0616, - "step": 3024 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3152, - "step": 3025 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4433, - "step": 3026 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8406, - "step": 3027 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2653, - "step": 3028 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7041, - "step": 3029 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3463, - "step": 3030 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7164, - "step": 3031 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9969, - "step": 3032 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1306, - "step": 3033 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0204, - "step": 3034 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6743, - "step": 3035 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3186, - "step": 3036 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5673, - "step": 3037 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1416, - "step": 3038 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1956, - "step": 3039 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6836, - "step": 3040 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0172, - "step": 3041 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.796, - "step": 3042 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6945, - "step": 3043 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5079, - "step": 3044 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.399, - "step": 3045 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0071, - "step": 3046 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4171, - "step": 3047 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0925, - "step": 3048 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6842, - "step": 3049 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2442, - "step": 3050 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8023, - "step": 3051 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7147, - "step": 3052 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9681, - "step": 3053 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1882, - "step": 3054 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9869, - "step": 3055 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0705, - "step": 3056 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8802, - "step": 3057 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8399, - "step": 3058 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6438, - "step": 3059 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0608, - "step": 3060 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.763, - "step": 3061 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.355, - "step": 3062 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5504, - "step": 3063 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1582, - "step": 3064 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1922, - "step": 3065 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0706, - "step": 3066 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.807, - "step": 3067 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0305, - "step": 3068 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0774, - "step": 3069 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4469, - "step": 3070 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1158, - "step": 3071 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8087, - "step": 3072 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5672, - "step": 3073 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5864, - "step": 3074 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7665, - "step": 3075 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2434, - "step": 3076 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3988, - "step": 3077 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0307, - "step": 3078 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6561, - "step": 3079 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8158, - "step": 3080 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8384, - "step": 3081 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5515, - "step": 3082 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8108, - "step": 3083 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2536, - "step": 3084 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2506, - "step": 3085 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1605, - "step": 3086 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4572, - "step": 3087 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.3312, - "step": 3088 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1481, - "step": 3089 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.3304, - "step": 3090 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2617, - "step": 3091 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3148, - "step": 3092 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4781, - "step": 3093 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.327, - "step": 3094 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3288, - "step": 3095 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2354, - "step": 3096 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4802, - "step": 3097 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1659, - "step": 3098 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.9887, - "step": 3099 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.9497, - "step": 3100 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2668, - "step": 3101 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.458, - "step": 3102 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9919, - "step": 3103 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0408, - "step": 3104 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9966, - "step": 3105 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1371, - "step": 3106 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0939, - "step": 3107 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2532, - "step": 3108 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7343, - "step": 3109 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.156, - "step": 3110 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2223, - "step": 3111 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6182, - "step": 3112 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4423, - "step": 3113 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3855, - "step": 3114 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2115, - "step": 3115 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6719, - "step": 3116 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5285, - "step": 3117 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0171, - "step": 3118 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2127, - "step": 3119 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8153, - "step": 3120 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1838, - "step": 3121 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.711, - "step": 3122 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1568, - "step": 3123 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3741, - "step": 3124 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2741, - "step": 3125 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1653, - "step": 3126 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9722, - "step": 3127 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9181, - "step": 3128 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.384, - "step": 3129 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1491, - "step": 3130 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8641, - "step": 3131 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6125, - "step": 3132 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1702, - "step": 3133 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4853, - "step": 3134 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7929, - "step": 3135 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8747, - "step": 3136 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2659, - "step": 3137 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0685, - "step": 3138 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2297, - "step": 3139 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0715, - "step": 3140 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2201, - "step": 3141 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2134, - "step": 3142 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6098, - "step": 3143 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2036, - "step": 3144 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2481, - "step": 3145 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4508, - "step": 3146 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1454, - "step": 3147 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7573, - "step": 3148 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2946, - "step": 3149 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0121, - "step": 3150 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.498, - "step": 3151 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4016, - "step": 3152 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5934, - "step": 3153 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.737, - "step": 3154 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9874, - "step": 3155 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7714, - "step": 3156 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3529, - "step": 3157 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7673, - "step": 3158 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3835, - "step": 3159 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0336, - "step": 3160 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2998, - "step": 3161 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0446, - "step": 3162 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5225, - "step": 3163 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1391, - "step": 3164 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7669, - "step": 3165 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.118, - "step": 3166 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7045, - "step": 3167 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.077, - "step": 3168 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0697, - "step": 3169 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8445, - "step": 3170 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4044, - "step": 3171 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9817, - "step": 3172 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.2508, - "step": 3173 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6302, - "step": 3174 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.46, - "step": 3175 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8094, - "step": 3176 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1174, - "step": 3177 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5756, - "step": 3178 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4215, - "step": 3179 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5957, - "step": 3180 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3534, - "step": 3181 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9342, - "step": 3182 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8227, - "step": 3183 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1404, - "step": 3184 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9897, - "step": 3185 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7286, - "step": 3186 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5414, - "step": 3187 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7523, - "step": 3188 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4033, - "step": 3189 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8593, - "step": 3190 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6095, - "step": 3191 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.504, - "step": 3192 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6776, - "step": 3193 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0174, - "step": 3194 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7759, - "step": 3195 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5685, - "step": 3196 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2786, - "step": 3197 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7794, - "step": 3198 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5827, - "step": 3199 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3282, - "step": 3200 - }, - { - "epoch": 0.02, - "eval_loss": 6.423073768615723, - "eval_runtime": 22.4644, - "eval_samples_per_second": 2.226, - "eval_steps_per_second": 1.113, - "step": 3200 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.9956862831115725, - "step": 3200 - } - ], - "max_steps": 30000, - "num_train_epochs": 1, - "total_flos": 5.284691537608704e+16, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoint-3200/training_args.bin b/checkpoint-3200/training_args.bin deleted file mode 100644 index 53a16291359ea01b885cc36189679e385fee54a8..0000000000000000000000000000000000000000 --- a/checkpoint-3200/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f2f399ab69470e06aaa321f2990a85c1505da75b9e960c095081ae355addfd1d -size 6011 diff --git a/checkpoint-3400/README.md b/checkpoint-3400/README.md deleted file mode 100644 index 82793f73e61dbb024e11fc6697bba1622d4d0db6..0000000000000000000000000000000000000000 --- a/checkpoint-3400/README.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -library_name: peft ---- -## Training procedure - - -The following `bitsandbytes` quantization config was used during training: -- load_in_8bit: False -- load_in_4bit: True -- llm_int8_threshold: 6.0 -- llm_int8_skip_modules: None -- llm_int8_enable_fp32_cpu_offload: False -- llm_int8_has_fp16_weight: False -- bnb_4bit_quant_type: nf4 -- bnb_4bit_use_double_quant: True -- bnb_4bit_compute_dtype: bfloat16 -### Framework versions - - -- PEFT 0.4.0 diff --git a/checkpoint-3400/adapter_config.json b/checkpoint-3400/adapter_config.json deleted file mode 100644 index 2adcd7d22e9c842efe5230fdbfc7ae7a84aededb..0000000000000000000000000000000000000000 --- a/checkpoint-3400/adapter_config.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "auto_mapping": null, - "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": true, - "layers_pattern": null, - "layers_to_transform": null, - "lora_alpha": 16.0, - "lora_dropout": 0.1, - "modules_to_save": null, - "peft_type": "LORA", - "r": 64, - "revision": null, - "target_modules": [ - "q_proj", - "o_proj", - "k_proj", - "gate_proj", - "down_proj", - "v_proj", - "up_proj" - ], - "task_type": "CAUSAL_LM" -} \ No newline at end of file diff --git a/checkpoint-3400/adapter_model.bin b/checkpoint-3400/adapter_model.bin deleted file mode 100644 index 244b01794acc537fb109dc649ea48daef566e2b5..0000000000000000000000000000000000000000 --- a/checkpoint-3400/adapter_model.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:15b3881839fd2acad061e23d9b23ac58d5cc9224d94ead590ff1540028d9e50e -size 871609293 diff --git a/checkpoint-3400/added_tokens.json b/checkpoint-3400/added_tokens.json deleted file mode 100644 index e41416ddd79948246ea2dced6800ea3cd531c424..0000000000000000000000000000000000000000 --- a/checkpoint-3400/added_tokens.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "[PAD]": 32000 -} diff --git a/checkpoint-3400/optimizer.pt b/checkpoint-3400/optimizer.pt deleted file mode 100644 index c265a12da2ee958b04d359e81e474b1e6a94ff70..0000000000000000000000000000000000000000 --- a/checkpoint-3400/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:90be3aa2d6d3d55f3d07ec3ba563395e78370db349bc63ea4347cba045727c4b -size 873873439 diff --git a/checkpoint-3400/rng_state.pth b/checkpoint-3400/rng_state.pth deleted file mode 100644 index 9be755892a020892c210246df801fc266a58408d..0000000000000000000000000000000000000000 --- a/checkpoint-3400/rng_state.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8ad79a64681714333886bacb7a33b39fd9a08cd915f5b53a0695fc75adf31372 -size 14511 diff --git a/checkpoint-3400/scheduler.pt b/checkpoint-3400/scheduler.pt deleted file mode 100644 index c88162200a06347eac69bcd2d6ca4adb6ef01c14..0000000000000000000000000000000000000000 --- a/checkpoint-3400/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:375ff21cc0ac3c3d2481c3e10491bf0755513bd8242939e41e1aee1a2d5b88f8 -size 627 diff --git a/checkpoint-3400/special_tokens_map.json b/checkpoint-3400/special_tokens_map.json deleted file mode 100644 index 3f58a5e115855c6ea3cec98accae196ad927222e..0000000000000000000000000000000000000000 --- a/checkpoint-3400/special_tokens_map.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "bos_token": "", - "eos_token": "", - "pad_token": "[PAD]", - "unk_token": "" -} diff --git a/checkpoint-3400/tokenizer.model b/checkpoint-3400/tokenizer.model deleted file mode 100644 index 6c00c742ce03c627d6cd5b795984876fa49fa899..0000000000000000000000000000000000000000 --- a/checkpoint-3400/tokenizer.model +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 -size 499723 diff --git a/checkpoint-3400/tokenizer_config.json b/checkpoint-3400/tokenizer_config.json deleted file mode 100644 index daaef2433dab9469de98b5b9a3848221ab25b7e8..0000000000000000000000000000000000000000 --- a/checkpoint-3400/tokenizer_config.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "bos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - }, - "clean_up_tokenization_spaces": false, - "eos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - }, - "legacy": null, - "model_max_length": 1000000000000000019884624838656, - "pad_token": null, - "padding_side": "right", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizer", - "unk_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - } -} diff --git a/checkpoint-3400/trainer_state.json b/checkpoint-3400/trainer_state.json deleted file mode 100644 index 889e265972094322b8b9393ca04e28d3b7448022..0000000000000000000000000000000000000000 --- a/checkpoint-3400/trainer_state.json +++ /dev/null @@ -1,20731 +0,0 @@ -{ - "best_metric": 6.423073768615723, - "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-3200", - "epoch": 0.02597204186082041, - "global_step": 3400, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0808, - "step": 1 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8773, - "step": 2 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1965, - "step": 3 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.118, - "step": 4 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1773, - "step": 5 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1165, - "step": 6 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2666, - "step": 7 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3704, - "step": 8 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9976, - "step": 9 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.985, - "step": 10 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.0541, - "step": 11 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.6228, - "step": 12 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.3651, - "step": 13 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0867, - "step": 14 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.4422, - "step": 15 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.7759, - "step": 16 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1446, - "step": 17 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0007, - "step": 18 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.0894, - "step": 19 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2424, - "step": 20 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.1343, - "step": 21 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5354, - "step": 22 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1887, - "step": 23 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.6652, - "step": 24 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.964, - "step": 25 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1872, - "step": 26 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.4722, - "step": 27 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.1462, - "step": 28 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0485, - "step": 29 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.148, - "step": 30 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7274, - "step": 31 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.6689, - "step": 32 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3384, - "step": 33 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.5354, - "step": 34 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.1976, - "step": 35 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.8593, - "step": 36 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.9302, - "step": 37 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5968, - "step": 38 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3169, - "step": 39 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.1793, - "step": 40 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.8457, - "step": 41 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.5177, - "step": 42 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.003, - "step": 43 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.9928, - "step": 44 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.2574, - "step": 45 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 0.3915, - "step": 46 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4105, - "step": 47 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.1184, - "step": 48 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.72, - "step": 49 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9628, - "step": 50 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2372, - "step": 51 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3733, - "step": 52 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8936, - "step": 53 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5353, - "step": 54 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.0754, - "step": 55 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6685, - "step": 56 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8984, - "step": 57 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2265, - "step": 58 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7696, - "step": 59 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7349, - "step": 60 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0221, - "step": 61 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.1901, - "step": 62 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.387, - "step": 63 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7323, - "step": 64 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2077, - "step": 65 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.3155, - "step": 66 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1656, - "step": 67 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 13.0828, - "step": 68 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5295, - "step": 69 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4575, - "step": 70 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.7654, - "step": 71 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.6263, - "step": 72 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 24.8238, - "step": 73 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.0654, - "step": 74 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 28.1046, - "step": 75 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.3232, - "step": 76 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 22.9712, - "step": 77 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 18.8529, - "step": 78 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 15.8356, - "step": 79 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 16.472, - "step": 80 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.2369, - "step": 81 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 14.0731, - "step": 82 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.8853, - "step": 83 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5438, - "step": 84 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2665, - "step": 85 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5484, - "step": 86 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7546, - "step": 87 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4309, - "step": 88 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5593, - "step": 89 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3822, - "step": 90 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.6315, - "step": 91 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6116, - "step": 92 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2288, - "step": 93 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0053, - "step": 94 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.359, - "step": 95 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9235, - "step": 96 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 31.9845, - "step": 97 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.1385, - "step": 98 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6161, - "step": 99 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8096, - "step": 100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9918, - "step": 101 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.344, - "step": 102 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1607, - "step": 103 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4834, - "step": 104 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.704, - "step": 105 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1238, - "step": 106 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8066, - "step": 107 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9656, - "step": 108 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1979, - "step": 109 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2294, - "step": 110 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.066, - "step": 111 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7914, - "step": 112 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7344, - "step": 113 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6703, - "step": 114 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8817, - "step": 115 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.7733, - "step": 116 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.469, - "step": 117 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1304, - "step": 118 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.871, - "step": 119 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5353, - "step": 120 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9055, - "step": 121 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6142, - "step": 122 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0201, - "step": 123 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3805, - "step": 124 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6825, - "step": 125 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7166, - "step": 126 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7747, - "step": 127 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7695, - "step": 128 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7291, - "step": 129 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1296, - "step": 130 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5374, - "step": 131 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1854, - "step": 132 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.434, - "step": 133 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.438, - "step": 134 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3027, - "step": 135 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.382, - "step": 136 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9277, - "step": 137 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.223, - "step": 138 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3042, - "step": 139 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6361, - "step": 140 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3547, - "step": 141 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.7181, - "step": 142 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.7528, - "step": 143 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.4316, - "step": 144 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2219, - "step": 145 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7788, - "step": 146 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2749, - "step": 147 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2397, - "step": 148 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6243, - "step": 149 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.145, - "step": 150 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7951, - "step": 151 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1862, - "step": 152 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.1305, - "step": 153 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5766, - "step": 154 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9232, - "step": 155 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9936, - "step": 156 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.9692, - "step": 157 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2772, - "step": 158 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.302, - "step": 159 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9931, - "step": 160 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9675, - "step": 161 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.8536, - "step": 162 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6589, - "step": 163 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.932, - "step": 164 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0301, - "step": 165 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4861, - "step": 166 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1354, - "step": 167 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0717, - "step": 168 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9346, - "step": 169 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9373, - "step": 170 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8777, - "step": 171 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4193, - "step": 172 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6831, - "step": 173 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4175, - "step": 174 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3629, - "step": 175 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.118, - "step": 176 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.633, - "step": 177 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8355, - "step": 178 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4522, - "step": 179 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9272, - "step": 180 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4631, - "step": 181 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2987, - "step": 182 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1183, - "step": 183 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9976, - "step": 184 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0668, - "step": 185 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6291, - "step": 186 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5937, - "step": 187 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7382, - "step": 188 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7677, - "step": 189 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0293, - "step": 190 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6407, - "step": 191 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9508, - "step": 192 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.5053, - "step": 193 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5718, - "step": 194 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5211, - "step": 195 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9557, - "step": 196 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1609, - "step": 197 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8505, - "step": 198 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8278, - "step": 199 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8447, - "step": 200 - }, - { - "epoch": 0.0, - "eval_loss": 7.883856773376465, - "eval_runtime": 22.4254, - "eval_samples_per_second": 2.23, - "eval_steps_per_second": 1.115, - "step": 200 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 4.629522514343262, - "step": 200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3249, - "step": 201 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.352, - "step": 202 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2984, - "step": 203 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.2734, - "step": 204 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1, - "step": 205 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.448, - "step": 206 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2387, - "step": 207 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.861, - "step": 208 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.603, - "step": 209 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.29, - "step": 210 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2105, - "step": 211 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1949, - "step": 212 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0538, - "step": 213 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0343, - "step": 214 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7794, - "step": 215 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5532, - "step": 216 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2676, - "step": 217 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.566, - "step": 218 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0432, - "step": 219 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9391, - "step": 220 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.724, - "step": 221 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.229, - "step": 222 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3462, - "step": 223 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0752, - "step": 224 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.1966, - "step": 225 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7279, - "step": 226 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8484, - "step": 227 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7291, - "step": 228 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2665, - "step": 229 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3551, - "step": 230 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7338, - "step": 231 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8407, - "step": 232 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3581, - "step": 233 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.441, - "step": 234 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0788, - "step": 235 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8404, - "step": 236 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4314, - "step": 237 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8426, - "step": 238 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.0205, - "step": 239 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4162, - "step": 240 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7515, - "step": 241 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1442, - "step": 242 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5868, - "step": 243 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6514, - "step": 244 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2683, - "step": 245 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.31, - "step": 246 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0161, - "step": 247 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.484, - "step": 248 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9726, - "step": 249 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.0926, - "step": 250 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5279, - "step": 251 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0017, - "step": 252 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5684, - "step": 253 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3875, - "step": 254 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9489, - "step": 255 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.8948, - "step": 256 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0856, - "step": 257 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.599, - "step": 258 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1575, - "step": 259 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3701, - "step": 260 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.464, - "step": 261 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9193, - "step": 262 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5679, - "step": 263 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9424, - "step": 264 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6689, - "step": 265 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6475, - "step": 266 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4311, - "step": 267 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7426, - "step": 268 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5191, - "step": 269 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3059, - "step": 270 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0142, - "step": 271 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.4509, - "step": 272 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0831, - "step": 273 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6977, - "step": 274 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4236, - "step": 275 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2129, - "step": 276 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1394, - "step": 277 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.685, - "step": 278 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0275, - "step": 279 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.3215, - "step": 280 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6542, - "step": 281 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7614, - "step": 282 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2996, - "step": 283 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6275, - "step": 284 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8736, - "step": 285 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4667, - "step": 286 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8486, - "step": 287 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2125, - "step": 288 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4523, - "step": 289 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.551, - "step": 290 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.7158, - "step": 291 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5092, - "step": 292 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9169, - "step": 293 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5333, - "step": 294 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9949, - "step": 295 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.7189, - "step": 296 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2366, - "step": 297 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4745, - "step": 298 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2439, - "step": 299 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4176, - "step": 300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.9365, - "step": 301 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5309, - "step": 302 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2201, - "step": 303 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0312, - "step": 304 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4173, - "step": 305 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4856, - "step": 306 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5041, - "step": 307 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3597, - "step": 308 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8395, - "step": 309 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0776, - "step": 310 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7566, - "step": 311 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9767, - "step": 312 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.3804, - "step": 313 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5327, - "step": 314 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5293, - "step": 315 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4531, - "step": 316 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3961, - "step": 317 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5669, - "step": 318 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8559, - "step": 319 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.117, - "step": 320 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4279, - "step": 321 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7977, - "step": 322 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.955, - "step": 323 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0164, - "step": 324 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 12.0495, - "step": 325 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2768, - "step": 326 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3162, - "step": 327 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.88, - "step": 328 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2157, - "step": 329 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8427, - "step": 330 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9729, - "step": 331 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1779, - "step": 332 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1302, - "step": 333 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7705, - "step": 334 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.523, - "step": 335 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9375, - "step": 336 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.1409, - "step": 337 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.633, - "step": 338 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6481, - "step": 339 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.933, - "step": 340 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9179, - "step": 341 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9332, - "step": 342 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6553, - "step": 343 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7412, - "step": 344 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.849, - "step": 345 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.7321, - "step": 346 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9717, - "step": 347 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3465, - "step": 348 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4535, - "step": 349 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2376, - "step": 350 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9025, - "step": 351 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.916, - "step": 352 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.3785, - "step": 353 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0576, - "step": 354 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5081, - "step": 355 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1303, - "step": 356 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3854, - "step": 357 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 11.5553, - "step": 358 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9627, - "step": 359 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.402, - "step": 360 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3484, - "step": 361 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5428, - "step": 362 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9128, - "step": 363 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3934, - "step": 364 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4812, - "step": 365 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5395, - "step": 366 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6304, - "step": 367 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5626, - "step": 368 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5693, - "step": 369 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3458, - "step": 370 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6254, - "step": 371 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8706, - "step": 372 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6076, - "step": 373 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.2912, - "step": 374 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3326, - "step": 375 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3735, - "step": 376 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4916, - "step": 377 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5553, - "step": 378 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6241, - "step": 379 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6106, - "step": 380 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.266, - "step": 381 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7738, - "step": 382 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4988, - "step": 383 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2968, - "step": 384 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8512, - "step": 385 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0341, - "step": 386 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.898, - "step": 387 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.23, - "step": 388 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9608, - "step": 389 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.3679, - "step": 390 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.7074, - "step": 391 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9903, - "step": 392 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5845, - "step": 393 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6493, - "step": 394 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7962, - "step": 395 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4865, - "step": 396 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3418, - "step": 397 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3942, - "step": 398 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4715, - "step": 399 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.2073, - "step": 400 - }, - { - "epoch": 0.0, - "eval_loss": 7.106412410736084, - "eval_runtime": 22.5667, - "eval_samples_per_second": 2.216, - "eval_steps_per_second": 1.108, - "step": 400 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 2.9128687667846678, - "step": 400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3984, - "step": 401 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7983, - "step": 402 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.8589, - "step": 403 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9884, - "step": 404 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.4427, - "step": 405 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0374, - "step": 406 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7999, - "step": 407 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2437, - "step": 408 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6902, - "step": 409 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.81, - "step": 410 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8979, - "step": 411 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0211, - "step": 412 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3945, - "step": 413 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.5807, - "step": 414 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1433, - "step": 415 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9466, - "step": 416 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6276, - "step": 417 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4945, - "step": 418 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6215, - "step": 419 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.3919, - "step": 420 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7915, - "step": 421 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3284, - "step": 422 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8723, - "step": 423 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0149, - "step": 424 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.979, - "step": 425 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9175, - "step": 426 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4994, - "step": 427 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.9791, - "step": 428 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1156, - "step": 429 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5813, - "step": 430 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1882, - "step": 431 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9956, - "step": 432 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6189, - "step": 433 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9624, - "step": 434 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5387, - "step": 435 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4605, - "step": 436 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.474, - "step": 437 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0497, - "step": 438 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5705, - "step": 439 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.275, - "step": 440 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9638, - "step": 441 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4857, - "step": 442 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3067, - "step": 443 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8152, - "step": 444 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1668, - "step": 445 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5293, - "step": 446 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3981, - "step": 447 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4787, - "step": 448 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5981, - "step": 449 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3569, - "step": 450 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4088, - "step": 451 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3677, - "step": 452 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4686, - "step": 453 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3552, - "step": 454 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7931, - "step": 455 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.9285, - "step": 456 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0554, - "step": 457 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7277, - "step": 458 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2474, - "step": 459 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9274, - "step": 460 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2558, - "step": 461 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7547, - "step": 462 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1264, - "step": 463 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2124, - "step": 464 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8751, - "step": 465 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7317, - "step": 466 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3697, - "step": 467 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0021, - "step": 468 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3761, - "step": 469 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2291, - "step": 470 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7968, - "step": 471 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9454, - "step": 472 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0194, - "step": 473 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5048, - "step": 474 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6837, - "step": 475 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1066, - "step": 476 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3501, - "step": 477 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5071, - "step": 478 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1086, - "step": 479 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7269, - "step": 480 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5419, - "step": 481 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2974, - "step": 482 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.1433, - "step": 483 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0869, - "step": 484 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.032, - "step": 485 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0946, - "step": 486 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7162, - "step": 487 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.0406, - "step": 488 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9048, - "step": 489 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2231, - "step": 490 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.6524, - "step": 491 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1151, - "step": 492 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.591, - "step": 493 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.1628, - "step": 494 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0757, - "step": 495 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.3471, - "step": 496 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9385, - "step": 497 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9362, - "step": 498 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2252, - "step": 499 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.359, - "step": 500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0497, - "step": 501 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0484, - "step": 502 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5773, - "step": 503 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.39, - "step": 504 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5923, - "step": 505 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2, - "step": 506 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5536, - "step": 507 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.8958, - "step": 508 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7763, - "step": 509 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2045, - "step": 510 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4219, - "step": 511 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6305, - "step": 512 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.4243, - "step": 513 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7842, - "step": 514 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8769, - "step": 515 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8903, - "step": 516 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0489, - "step": 517 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1314, - "step": 518 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.5973, - "step": 519 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.8022, - "step": 520 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3539, - "step": 521 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.222, - "step": 522 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5403, - "step": 523 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.1323, - "step": 524 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7813, - "step": 525 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4982, - "step": 526 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2426, - "step": 527 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0142, - "step": 528 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8996, - "step": 529 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.8671, - "step": 530 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4139, - "step": 531 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9478, - "step": 532 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7062, - "step": 533 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0098, - "step": 534 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9195, - "step": 535 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0255, - "step": 536 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.6291, - "step": 537 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3245, - "step": 538 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6382, - "step": 539 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 1.8076, - "step": 540 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6725, - "step": 541 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0563, - "step": 542 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.6178, - "step": 543 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7974, - "step": 544 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.7535, - "step": 545 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4948, - "step": 546 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8941, - "step": 547 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.6496, - "step": 548 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.9084, - "step": 549 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.65, - "step": 550 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7673, - "step": 551 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 10.2221, - "step": 552 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.14, - "step": 553 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.6747, - "step": 554 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8009, - "step": 555 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7307, - "step": 556 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0143, - "step": 557 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.8098, - "step": 558 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.026, - "step": 559 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4572, - "step": 560 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7913, - "step": 561 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9962, - "step": 562 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.767, - "step": 563 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.9497, - "step": 564 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.9626, - "step": 565 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2536, - "step": 566 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0421, - "step": 567 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.8177, - "step": 568 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9241, - "step": 569 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0162, - "step": 570 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.3368, - "step": 571 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7515, - "step": 572 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.6389, - "step": 573 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.662, - "step": 574 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8097, - "step": 575 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9346, - "step": 576 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.3154, - "step": 577 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.7724, - "step": 578 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3685, - "step": 579 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.2775, - "step": 580 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.106, - "step": 581 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4733, - "step": 582 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2334, - "step": 583 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9478, - "step": 584 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0013, - "step": 585 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.7242, - "step": 586 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.922, - "step": 587 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.1418, - "step": 588 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4472, - "step": 589 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.4785, - "step": 590 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.783, - "step": 591 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.0706, - "step": 592 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4136, - "step": 593 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.5969, - "step": 594 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5157, - "step": 595 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.5658, - "step": 596 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.4647, - "step": 597 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.2028, - "step": 598 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.6913, - "step": 599 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7239, - "step": 600 - }, - { - "epoch": 0.0, - "eval_loss": 7.012163162231445, - "eval_runtime": 22.5807, - "eval_samples_per_second": 2.214, - "eval_steps_per_second": 1.107, - "step": 600 - }, - { - "epoch": 0.0, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.24488224029541, - "step": 600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.5253, - "step": 601 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0392, - "step": 602 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.447, - "step": 603 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.9441, - "step": 604 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.1874, - "step": 605 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.7817, - "step": 606 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0348, - "step": 607 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.5593, - "step": 608 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.9361, - "step": 609 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3534, - "step": 610 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.476, - "step": 611 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0937, - "step": 612 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3027, - "step": 613 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.5586, - "step": 614 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3796, - "step": 615 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.676, - "step": 616 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5321, - "step": 617 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.0059, - "step": 618 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.6139, - "step": 619 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.2391, - "step": 620 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.0636, - "step": 621 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0895, - "step": 622 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.62, - "step": 623 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.0469, - "step": 624 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.2173, - "step": 625 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 3.9432, - "step": 626 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.3928, - "step": 627 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0959, - "step": 628 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.1197, - "step": 629 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 2.4277, - "step": 630 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.418, - "step": 631 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8687, - "step": 632 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.0156, - "step": 633 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.573, - "step": 634 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.112, - "step": 635 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8954, - "step": 636 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.36, - "step": 637 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.924, - "step": 638 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.4625, - "step": 639 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.2023, - "step": 640 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.0685, - "step": 641 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.5304, - "step": 642 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4456, - "step": 643 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.7271, - "step": 644 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.6011, - "step": 645 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.895, - "step": 646 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 4.864, - "step": 647 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.3452, - "step": 648 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 5.8978, - "step": 649 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.2253, - "step": 650 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 9.2813, - "step": 651 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 7.7248, - "step": 652 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 8.4283, - "step": 653 - }, - { - "epoch": 0.0, - "learning_rate": 0.0004, - "loss": 6.4304, - "step": 654 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3893, - "step": 655 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1115, - "step": 656 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5892, - "step": 657 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6572, - "step": 658 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.925, - "step": 659 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4431, - "step": 660 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7711, - "step": 661 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9439, - "step": 662 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3781, - "step": 663 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5573, - "step": 664 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.4476, - "step": 665 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0057, - "step": 666 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2702, - "step": 667 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5717, - "step": 668 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2242, - "step": 669 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1, - "step": 670 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0517, - "step": 671 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6543, - "step": 672 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1138, - "step": 673 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.461, - "step": 674 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7094, - "step": 675 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.521, - "step": 676 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7116, - "step": 677 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6343, - "step": 678 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3762, - "step": 679 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3603, - "step": 680 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7144, - "step": 681 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4545, - "step": 682 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8188, - "step": 683 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7965, - "step": 684 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4675, - "step": 685 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0436, - "step": 686 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1219, - "step": 687 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4517, - "step": 688 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8476, - "step": 689 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9284, - "step": 690 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7405, - "step": 691 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7142, - "step": 692 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3979, - "step": 693 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.3285, - "step": 694 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3418, - "step": 695 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4472, - "step": 696 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7355, - "step": 697 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7982, - "step": 698 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4516, - "step": 699 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2532, - "step": 700 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9959, - "step": 701 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0418, - "step": 702 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.7767, - "step": 703 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.774, - "step": 704 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8912, - "step": 705 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2178, - "step": 706 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6197, - "step": 707 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4755, - "step": 708 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8276, - "step": 709 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2925, - "step": 710 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3887, - "step": 711 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1465, - "step": 712 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5806, - "step": 713 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3063, - "step": 714 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6066, - "step": 715 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1536, - "step": 716 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5582, - "step": 717 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0353, - "step": 718 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6415, - "step": 719 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8291, - "step": 720 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.7575, - "step": 721 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9141, - "step": 722 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5217, - "step": 723 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4549, - "step": 724 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8112, - "step": 725 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2729, - "step": 726 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8515, - "step": 727 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9712, - "step": 728 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.097, - "step": 729 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0208, - "step": 730 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1377, - "step": 731 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4019, - "step": 732 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9869, - "step": 733 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2954, - "step": 734 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4144, - "step": 735 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8053, - "step": 736 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8891, - "step": 737 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.812, - "step": 738 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2657, - "step": 739 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3747, - "step": 740 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0364, - "step": 741 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8845, - "step": 742 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.887, - "step": 743 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0706, - "step": 744 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6619, - "step": 745 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2941, - "step": 746 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9192, - "step": 747 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9947, - "step": 748 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6376, - "step": 749 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0358, - "step": 750 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4578, - "step": 751 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7784, - "step": 752 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.632, - "step": 753 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8649, - "step": 754 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7951, - "step": 755 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3841, - "step": 756 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4558, - "step": 757 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7638, - "step": 758 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9413, - "step": 759 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0916, - "step": 760 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1351, - "step": 761 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6078, - "step": 762 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7982, - "step": 763 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6132, - "step": 764 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.551, - "step": 765 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3301, - "step": 766 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4888, - "step": 767 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1476, - "step": 768 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4244, - "step": 769 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6025, - "step": 770 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.102, - "step": 771 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.017, - "step": 772 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4101, - "step": 773 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1741, - "step": 774 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1256, - "step": 775 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5164, - "step": 776 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6959, - "step": 777 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7666, - "step": 778 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4336, - "step": 779 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 11.8478, - "step": 780 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8382, - "step": 781 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1792, - "step": 782 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4424, - "step": 783 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.345, - "step": 784 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6887, - "step": 785 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9867, - "step": 786 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6152, - "step": 787 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7283, - "step": 788 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0157, - "step": 789 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6044, - "step": 790 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4132, - "step": 791 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.735, - "step": 792 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3631, - "step": 793 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2308, - "step": 794 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2184, - "step": 795 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4661, - "step": 796 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9637, - "step": 797 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4178, - "step": 798 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5909, - "step": 799 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1482, - "step": 800 - }, - { - "epoch": 0.01, - "eval_loss": 7.355834484100342, - "eval_runtime": 22.6252, - "eval_samples_per_second": 2.21, - "eval_steps_per_second": 1.105, - "step": 800 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 5.191131496429444, - "step": 800 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.0427, - "step": 801 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2669, - "step": 802 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8026, - "step": 803 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4949, - "step": 804 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4491, - "step": 805 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0383, - "step": 806 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1213, - "step": 807 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5158, - "step": 808 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5648, - "step": 809 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9932, - "step": 810 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6441, - "step": 811 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8661, - "step": 812 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3609, - "step": 813 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6828, - "step": 814 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9693, - "step": 815 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3733, - "step": 816 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6286, - "step": 817 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4349, - "step": 818 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6706, - "step": 819 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3089, - "step": 820 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2394, - "step": 821 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.963, - "step": 822 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6564, - "step": 823 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.997, - "step": 824 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9261, - "step": 825 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1421, - "step": 826 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2335, - "step": 827 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3432, - "step": 828 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0154, - "step": 829 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5135, - "step": 830 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6226, - "step": 831 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1672, - "step": 832 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0853, - "step": 833 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1213, - "step": 834 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7815, - "step": 835 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8916, - "step": 836 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6464, - "step": 837 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3307, - "step": 838 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8165, - "step": 839 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.886, - "step": 840 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4781, - "step": 841 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8351, - "step": 842 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.358, - "step": 843 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6501, - "step": 844 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0864, - "step": 845 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2922, - "step": 846 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.9847, - "step": 847 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2558, - "step": 848 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0195, - "step": 849 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.996, - "step": 850 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5705, - "step": 851 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4136, - "step": 852 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6302, - "step": 853 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8761, - "step": 854 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4995, - "step": 855 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4762, - "step": 856 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5749, - "step": 857 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0273, - "step": 858 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8258, - "step": 859 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1836, - "step": 860 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5493, - "step": 861 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1891, - "step": 862 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7392, - "step": 863 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1655, - "step": 864 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5218, - "step": 865 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3759, - "step": 866 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2497, - "step": 867 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5901, - "step": 868 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0624, - "step": 869 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.2452, - "step": 870 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5649, - "step": 871 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0826, - "step": 872 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2703, - "step": 873 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9088, - "step": 874 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3875, - "step": 875 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2511, - "step": 876 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4065, - "step": 877 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.175, - "step": 878 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8358, - "step": 879 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3208, - "step": 880 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2049, - "step": 881 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8251, - "step": 882 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4262, - "step": 883 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2227, - "step": 884 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1062, - "step": 885 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9417, - "step": 886 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3315, - "step": 887 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0012, - "step": 888 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6386, - "step": 889 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0377, - "step": 890 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6707, - "step": 891 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4955, - "step": 892 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7343, - "step": 893 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8305, - "step": 894 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7016, - "step": 895 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7149, - "step": 896 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5649, - "step": 897 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.815, - "step": 898 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6135, - "step": 899 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8776, - "step": 900 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7288, - "step": 901 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8019, - "step": 902 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0921, - "step": 903 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.824, - "step": 904 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7151, - "step": 905 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5054, - "step": 906 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8095, - "step": 907 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3218, - "step": 908 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9993, - "step": 909 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4433, - "step": 910 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5863, - "step": 911 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.505, - "step": 912 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9734, - "step": 913 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1792, - "step": 914 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4574, - "step": 915 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2787, - "step": 916 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8201, - "step": 917 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2537, - "step": 918 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1387, - "step": 919 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7161, - "step": 920 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2207, - "step": 921 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7953, - "step": 922 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9949, - "step": 923 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9173, - "step": 924 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7903, - "step": 925 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4784, - "step": 926 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2264, - "step": 927 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.566, - "step": 928 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0686, - "step": 929 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.791, - "step": 930 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8393, - "step": 931 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4387, - "step": 932 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2374, - "step": 933 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9598, - "step": 934 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1597, - "step": 935 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0403, - "step": 936 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3301, - "step": 937 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.072, - "step": 938 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4788, - "step": 939 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0656, - "step": 940 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9647, - "step": 941 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1168, - "step": 942 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0293, - "step": 943 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3622, - "step": 944 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8957, - "step": 945 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4, - "step": 946 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6626, - "step": 947 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8212, - "step": 948 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8638, - "step": 949 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6406, - "step": 950 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7069, - "step": 951 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1384, - "step": 952 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.612, - "step": 953 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7201, - "step": 954 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3532, - "step": 955 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1266, - "step": 956 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6192, - "step": 957 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.826, - "step": 958 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9338, - "step": 959 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4487, - "step": 960 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.872, - "step": 961 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8601, - "step": 962 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7401, - "step": 963 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5412, - "step": 964 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2501, - "step": 965 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6837, - "step": 966 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6494, - "step": 967 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.604, - "step": 968 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.837, - "step": 969 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3957, - "step": 970 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3281, - "step": 971 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8264, - "step": 972 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6554, - "step": 973 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5768, - "step": 974 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4187, - "step": 975 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8479, - "step": 976 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9849, - "step": 977 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6471, - "step": 978 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8041, - "step": 979 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8876, - "step": 980 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6423, - "step": 981 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5329, - "step": 982 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2801, - "step": 983 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1699, - "step": 984 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6469, - "step": 985 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6766, - "step": 986 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7538, - "step": 987 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9606, - "step": 988 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0713, - "step": 989 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4965, - "step": 990 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3408, - "step": 991 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4007, - "step": 992 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8921, - "step": 993 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8681, - "step": 994 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.8867, - "step": 995 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.467, - "step": 996 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7895, - "step": 997 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0523, - "step": 998 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4032, - "step": 999 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7719, - "step": 1000 - }, - { - "epoch": 0.01, - "eval_loss": 6.766034126281738, - "eval_runtime": 22.4042, - "eval_samples_per_second": 2.232, - "eval_steps_per_second": 1.116, - "step": 1000 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 4.338861379623413, - "step": 1000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0285, - "step": 1001 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4571, - "step": 1002 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7721, - "step": 1003 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5108, - "step": 1004 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3813, - "step": 1005 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7963, - "step": 1006 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1101, - "step": 1007 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.021, - "step": 1008 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5916, - "step": 1009 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8813, - "step": 1010 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1193, - "step": 1011 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5092, - "step": 1012 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8569, - "step": 1013 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.119, - "step": 1014 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3247, - "step": 1015 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2358, - "step": 1016 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2795, - "step": 1017 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3466, - "step": 1018 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5443, - "step": 1019 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7296, - "step": 1020 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0412, - "step": 1021 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4829, - "step": 1022 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7901, - "step": 1023 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8077, - "step": 1024 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4887, - "step": 1025 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3095, - "step": 1026 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3235, - "step": 1027 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6315, - "step": 1028 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4294, - "step": 1029 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8457, - "step": 1030 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7583, - "step": 1031 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3129, - "step": 1032 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1832, - "step": 1033 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1764, - "step": 1034 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0101, - "step": 1035 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6524, - "step": 1036 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2825, - "step": 1037 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2262, - "step": 1038 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2533, - "step": 1039 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8794, - "step": 1040 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7901, - "step": 1041 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8351, - "step": 1042 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5888, - "step": 1043 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8932, - "step": 1044 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2999, - "step": 1045 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8396, - "step": 1046 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4209, - "step": 1047 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1524, - "step": 1048 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7784, - "step": 1049 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0179, - "step": 1050 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1153, - "step": 1051 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2149, - "step": 1052 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0117, - "step": 1053 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9693, - "step": 1054 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5656, - "step": 1055 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5, - "step": 1056 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.102, - "step": 1057 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3079, - "step": 1058 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5754, - "step": 1059 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6989, - "step": 1060 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9597, - "step": 1061 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3743, - "step": 1062 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8887, - "step": 1063 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3779, - "step": 1064 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5001, - "step": 1065 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4095, - "step": 1066 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5073, - "step": 1067 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1331, - "step": 1068 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.323, - "step": 1069 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6116, - "step": 1070 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1212, - "step": 1071 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0951, - "step": 1072 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2463, - "step": 1073 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4488, - "step": 1074 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.279, - "step": 1075 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5728, - "step": 1076 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1362, - "step": 1077 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6648, - "step": 1078 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.427, - "step": 1079 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8145, - "step": 1080 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5308, - "step": 1081 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.974, - "step": 1082 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1965, - "step": 1083 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8749, - "step": 1084 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7352, - "step": 1085 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7934, - "step": 1086 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6003, - "step": 1087 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5775, - "step": 1088 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.519, - "step": 1089 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7403, - "step": 1090 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8145, - "step": 1091 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5776, - "step": 1092 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3753, - "step": 1093 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9586, - "step": 1094 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7263, - "step": 1095 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7034, - "step": 1096 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0579, - "step": 1097 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8419, - "step": 1098 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0751, - "step": 1099 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6438, - "step": 1100 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8744, - "step": 1101 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4992, - "step": 1102 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8094, - "step": 1103 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.162, - "step": 1104 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8351, - "step": 1105 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8845, - "step": 1106 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1894, - "step": 1107 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.8333, - "step": 1108 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4226, - "step": 1109 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0712, - "step": 1110 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9981, - "step": 1111 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5885, - "step": 1112 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1915, - "step": 1113 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8003, - "step": 1114 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5566, - "step": 1115 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4085, - "step": 1116 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0793, - "step": 1117 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0909, - "step": 1118 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2273, - "step": 1119 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8273, - "step": 1120 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0231, - "step": 1121 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7229, - "step": 1122 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4479, - "step": 1123 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2178, - "step": 1124 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9038, - "step": 1125 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2653, - "step": 1126 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2974, - "step": 1127 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3003, - "step": 1128 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7853, - "step": 1129 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9143, - "step": 1130 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2573, - "step": 1131 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7091, - "step": 1132 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3372, - "step": 1133 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4165, - "step": 1134 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4422, - "step": 1135 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7693, - "step": 1136 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7802, - "step": 1137 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7263, - "step": 1138 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6749, - "step": 1139 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9459, - "step": 1140 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9697, - "step": 1141 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4506, - "step": 1142 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5099, - "step": 1143 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1475, - "step": 1144 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3769, - "step": 1145 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2035, - "step": 1146 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6017, - "step": 1147 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.463, - "step": 1148 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3844, - "step": 1149 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5306, - "step": 1150 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5502, - "step": 1151 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7394, - "step": 1152 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5626, - "step": 1153 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1618, - "step": 1154 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5174, - "step": 1155 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1038, - "step": 1156 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3789, - "step": 1157 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2985, - "step": 1158 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4763, - "step": 1159 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5071, - "step": 1160 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0827, - "step": 1161 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7349, - "step": 1162 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.798, - "step": 1163 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3176, - "step": 1164 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8114, - "step": 1165 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3379, - "step": 1166 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1157, - "step": 1167 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4675, - "step": 1168 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2721, - "step": 1169 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0603, - "step": 1170 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6358, - "step": 1171 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0865, - "step": 1172 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.709, - "step": 1173 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7705, - "step": 1174 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7677, - "step": 1175 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2418, - "step": 1176 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7114, - "step": 1177 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1165, - "step": 1178 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9654, - "step": 1179 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0672, - "step": 1180 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1738, - "step": 1181 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7604, - "step": 1182 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8426, - "step": 1183 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0231, - "step": 1184 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2938, - "step": 1185 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.783, - "step": 1186 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3328, - "step": 1187 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.321, - "step": 1188 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6368, - "step": 1189 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.101, - "step": 1190 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6777, - "step": 1191 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0831, - "step": 1192 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5853, - "step": 1193 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7923, - "step": 1194 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3734, - "step": 1195 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4268, - "step": 1196 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6796, - "step": 1197 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9028, - "step": 1198 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3716, - "step": 1199 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6761, - "step": 1200 - }, - { - "epoch": 0.01, - "eval_loss": 6.9188361167907715, - "eval_runtime": 22.426, - "eval_samples_per_second": 2.23, - "eval_steps_per_second": 1.115, - "step": 1200 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 3.3686839294433595, - "step": 1200 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8855, - "step": 1201 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8206, - "step": 1202 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4401, - "step": 1203 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2366, - "step": 1204 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9885, - "step": 1205 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5444, - "step": 1206 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4259, - "step": 1207 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5369, - "step": 1208 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0839, - "step": 1209 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7622, - "step": 1210 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8979, - "step": 1211 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5508, - "step": 1212 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6439, - "step": 1213 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6249, - "step": 1214 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.495, - "step": 1215 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0642, - "step": 1216 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8997, - "step": 1217 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6275, - "step": 1218 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3317, - "step": 1219 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4635, - "step": 1220 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5197, - "step": 1221 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5928, - "step": 1222 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2363, - "step": 1223 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0266, - "step": 1224 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3356, - "step": 1225 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7927, - "step": 1226 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6952, - "step": 1227 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8878, - "step": 1228 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7472, - "step": 1229 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6454, - "step": 1230 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4972, - "step": 1231 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3347, - "step": 1232 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1631, - "step": 1233 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4708, - "step": 1234 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5697, - "step": 1235 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8218, - "step": 1236 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.269, - "step": 1237 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4165, - "step": 1238 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3653, - "step": 1239 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0152, - "step": 1240 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9157, - "step": 1241 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4086, - "step": 1242 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2493, - "step": 1243 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8279, - "step": 1244 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6649, - "step": 1245 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4405, - "step": 1246 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1992, - "step": 1247 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2055, - "step": 1248 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4395, - "step": 1249 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2475, - "step": 1250 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8205, - "step": 1251 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1053, - "step": 1252 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7494, - "step": 1253 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7387, - "step": 1254 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8983, - "step": 1255 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5614, - "step": 1256 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7617, - "step": 1257 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2445, - "step": 1258 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3043, - "step": 1259 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4214, - "step": 1260 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1384, - "step": 1261 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3914, - "step": 1262 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3287, - "step": 1263 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2174, - "step": 1264 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4397, - "step": 1265 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6875, - "step": 1266 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4512, - "step": 1267 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2834, - "step": 1268 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7651, - "step": 1269 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9263, - "step": 1270 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6721, - "step": 1271 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9178, - "step": 1272 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7967, - "step": 1273 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5242, - "step": 1274 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7794, - "step": 1275 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4256, - "step": 1276 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5788, - "step": 1277 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7586, - "step": 1278 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.964, - "step": 1279 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0749, - "step": 1280 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6248, - "step": 1281 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2465, - "step": 1282 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1591, - "step": 1283 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4328, - "step": 1284 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.534, - "step": 1285 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.523, - "step": 1286 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5672, - "step": 1287 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9162, - "step": 1288 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1089, - "step": 1289 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3287, - "step": 1290 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2499, - "step": 1291 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9645, - "step": 1292 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3903, - "step": 1293 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5322, - "step": 1294 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2211, - "step": 1295 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2788, - "step": 1296 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1862, - "step": 1297 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2678, - "step": 1298 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5399, - "step": 1299 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7935, - "step": 1300 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0391, - "step": 1301 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1049, - "step": 1302 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.365, - "step": 1303 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8809, - "step": 1304 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2335, - "step": 1305 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.5135, - "step": 1306 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2378, - "step": 1307 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9265, - "step": 1308 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.641, - "step": 1309 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9822, - "step": 1310 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3369, - "step": 1311 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3735, - "step": 1312 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2618, - "step": 1313 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6854, - "step": 1314 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3748, - "step": 1315 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9206, - "step": 1316 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1969, - "step": 1317 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1245, - "step": 1318 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9977, - "step": 1319 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5319, - "step": 1320 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4431, - "step": 1321 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7264, - "step": 1322 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.05, - "step": 1323 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3118, - "step": 1324 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4575, - "step": 1325 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.593, - "step": 1326 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0061, - "step": 1327 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2481, - "step": 1328 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8017, - "step": 1329 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8617, - "step": 1330 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7036, - "step": 1331 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0091, - "step": 1332 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9687, - "step": 1333 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3925, - "step": 1334 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1127, - "step": 1335 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8163, - "step": 1336 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0639, - "step": 1337 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8929, - "step": 1338 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5011, - "step": 1339 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.033, - "step": 1340 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0526, - "step": 1341 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4428, - "step": 1342 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3477, - "step": 1343 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.881, - "step": 1344 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5276, - "step": 1345 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4183, - "step": 1346 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4943, - "step": 1347 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9187, - "step": 1348 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1003, - "step": 1349 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1187, - "step": 1350 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8081, - "step": 1351 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4695, - "step": 1352 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5761, - "step": 1353 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9635, - "step": 1354 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2133, - "step": 1355 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2611, - "step": 1356 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6885, - "step": 1357 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1157, - "step": 1358 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4421, - "step": 1359 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2128, - "step": 1360 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6978, - "step": 1361 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9804, - "step": 1362 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3426, - "step": 1363 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2676, - "step": 1364 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.325, - "step": 1365 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1263, - "step": 1366 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7481, - "step": 1367 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6891, - "step": 1368 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8568, - "step": 1369 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9893, - "step": 1370 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0302, - "step": 1371 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3204, - "step": 1372 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9008, - "step": 1373 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2624, - "step": 1374 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6234, - "step": 1375 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2286, - "step": 1376 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3426, - "step": 1377 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1962, - "step": 1378 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3142, - "step": 1379 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.414, - "step": 1380 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0191, - "step": 1381 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4953, - "step": 1382 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6694, - "step": 1383 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8611, - "step": 1384 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.86, - "step": 1385 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6519, - "step": 1386 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.394, - "step": 1387 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2117, - "step": 1388 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9924, - "step": 1389 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.58, - "step": 1390 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4415, - "step": 1391 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7196, - "step": 1392 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7388, - "step": 1393 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4784, - "step": 1394 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.496, - "step": 1395 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8706, - "step": 1396 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1858, - "step": 1397 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9038, - "step": 1398 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4852, - "step": 1399 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2717, - "step": 1400 - }, - { - "epoch": 0.01, - "eval_loss": 6.97923469543457, - "eval_runtime": 22.472, - "eval_samples_per_second": 2.225, - "eval_steps_per_second": 1.112, - "step": 1400 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.657382688522339, - "step": 1400 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.843, - "step": 1401 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5611, - "step": 1402 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2434, - "step": 1403 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3136, - "step": 1404 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.686, - "step": 1405 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6365, - "step": 1406 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1811, - "step": 1407 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7537, - "step": 1408 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2949, - "step": 1409 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4827, - "step": 1410 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0965, - "step": 1411 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.007, - "step": 1412 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2861, - "step": 1413 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1774, - "step": 1414 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7777, - "step": 1415 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0259, - "step": 1416 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9024, - "step": 1417 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4786, - "step": 1418 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5873, - "step": 1419 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2744, - "step": 1420 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9484, - "step": 1421 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2093, - "step": 1422 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3394, - "step": 1423 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1247, - "step": 1424 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0691, - "step": 1425 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.559, - "step": 1426 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1518, - "step": 1427 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4143, - "step": 1428 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0287, - "step": 1429 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8112, - "step": 1430 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2625, - "step": 1431 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3528, - "step": 1432 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2715, - "step": 1433 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7849, - "step": 1434 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2002, - "step": 1435 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0658, - "step": 1436 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0671, - "step": 1437 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2577, - "step": 1438 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.803, - "step": 1439 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2974, - "step": 1440 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0897, - "step": 1441 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0805, - "step": 1442 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7681, - "step": 1443 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6565, - "step": 1444 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0174, - "step": 1445 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8507, - "step": 1446 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2105, - "step": 1447 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.572, - "step": 1448 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2904, - "step": 1449 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4623, - "step": 1450 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4774, - "step": 1451 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1277, - "step": 1452 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6204, - "step": 1453 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3219, - "step": 1454 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2517, - "step": 1455 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3026, - "step": 1456 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4016, - "step": 1457 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5256, - "step": 1458 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9316, - "step": 1459 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.631, - "step": 1460 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2888, - "step": 1461 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5511, - "step": 1462 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.9799, - "step": 1463 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6982, - "step": 1464 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4923, - "step": 1465 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8329, - "step": 1466 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2733, - "step": 1467 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8221, - "step": 1468 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.363, - "step": 1469 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6348, - "step": 1470 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3319, - "step": 1471 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6768, - "step": 1472 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1985, - "step": 1473 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6109, - "step": 1474 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.974, - "step": 1475 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8902, - "step": 1476 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6762, - "step": 1477 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8541, - "step": 1478 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3867, - "step": 1479 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9624, - "step": 1480 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8768, - "step": 1481 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7598, - "step": 1482 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6522, - "step": 1483 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8156, - "step": 1484 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3791, - "step": 1485 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2178, - "step": 1486 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8448, - "step": 1487 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5377, - "step": 1488 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7407, - "step": 1489 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7636, - "step": 1490 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4325, - "step": 1491 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8966, - "step": 1492 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0626, - "step": 1493 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.255, - "step": 1494 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2802, - "step": 1495 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.894, - "step": 1496 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6482, - "step": 1497 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8903, - "step": 1498 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8672, - "step": 1499 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6079, - "step": 1500 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6217, - "step": 1501 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2361, - "step": 1502 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3962, - "step": 1503 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0036, - "step": 1504 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5926, - "step": 1505 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.114, - "step": 1506 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4419, - "step": 1507 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7838, - "step": 1508 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6635, - "step": 1509 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2906, - "step": 1510 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4186, - "step": 1511 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4783, - "step": 1512 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1226, - "step": 1513 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2458, - "step": 1514 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5302, - "step": 1515 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1515, - "step": 1516 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4182, - "step": 1517 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8248, - "step": 1518 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2349, - "step": 1519 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9314, - "step": 1520 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1161, - "step": 1521 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4183, - "step": 1522 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4454, - "step": 1523 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5588, - "step": 1524 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8026, - "step": 1525 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7695, - "step": 1526 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3636, - "step": 1527 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2776, - "step": 1528 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5386, - "step": 1529 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.521, - "step": 1530 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8388, - "step": 1531 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3561, - "step": 1532 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9606, - "step": 1533 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9438, - "step": 1534 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7665, - "step": 1535 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5826, - "step": 1536 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.0798, - "step": 1537 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8545, - "step": 1538 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.302, - "step": 1539 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1092, - "step": 1540 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5021, - "step": 1541 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9384, - "step": 1542 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8761, - "step": 1543 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3316, - "step": 1544 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2051, - "step": 1545 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7907, - "step": 1546 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2534, - "step": 1547 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2274, - "step": 1548 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9226, - "step": 1549 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2502, - "step": 1550 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2703, - "step": 1551 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4359, - "step": 1552 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.128, - "step": 1553 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3147, - "step": 1554 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.026, - "step": 1555 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9393, - "step": 1556 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7753, - "step": 1557 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9049, - "step": 1558 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0538, - "step": 1559 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8691, - "step": 1560 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9377, - "step": 1561 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8385, - "step": 1562 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.939, - "step": 1563 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.727, - "step": 1564 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7866, - "step": 1565 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2439, - "step": 1566 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9607, - "step": 1567 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3505, - "step": 1568 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7729, - "step": 1569 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4669, - "step": 1570 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8178, - "step": 1571 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2173, - "step": 1572 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2136, - "step": 1573 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2888, - "step": 1574 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0386, - "step": 1575 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9041, - "step": 1576 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7544, - "step": 1577 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.3229, - "step": 1578 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4203, - "step": 1579 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.497, - "step": 1580 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8253, - "step": 1581 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0801, - "step": 1582 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1585, - "step": 1583 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6965, - "step": 1584 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.498, - "step": 1585 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8697, - "step": 1586 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2663, - "step": 1587 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7004, - "step": 1588 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6561, - "step": 1589 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.785, - "step": 1590 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5708, - "step": 1591 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.326, - "step": 1592 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2974, - "step": 1593 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1408, - "step": 1594 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6526, - "step": 1595 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4116, - "step": 1596 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0484, - "step": 1597 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3162, - "step": 1598 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3806, - "step": 1599 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0251, - "step": 1600 - }, - { - "epoch": 0.01, - "eval_loss": 6.617897987365723, - "eval_runtime": 22.4646, - "eval_samples_per_second": 2.226, - "eval_steps_per_second": 1.113, - "step": 1600 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.160770101547241, - "step": 1600 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9137, - "step": 1601 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2188, - "step": 1602 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7688, - "step": 1603 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9799, - "step": 1604 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5429, - "step": 1605 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8559, - "step": 1606 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3985, - "step": 1607 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.9139, - "step": 1608 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3303, - "step": 1609 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5168, - "step": 1610 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5194, - "step": 1611 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9557, - "step": 1612 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7102, - "step": 1613 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8961, - "step": 1614 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6123, - "step": 1615 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7808, - "step": 1616 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4919, - "step": 1617 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0723, - "step": 1618 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2931, - "step": 1619 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8478, - "step": 1620 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7126, - "step": 1621 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6622, - "step": 1622 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3861, - "step": 1623 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9856, - "step": 1624 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5076, - "step": 1625 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4168, - "step": 1626 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2825, - "step": 1627 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7497, - "step": 1628 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5672, - "step": 1629 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4095, - "step": 1630 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.649, - "step": 1631 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3418, - "step": 1632 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1337, - "step": 1633 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3829, - "step": 1634 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0294, - "step": 1635 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2164, - "step": 1636 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3294, - "step": 1637 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7047, - "step": 1638 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5622, - "step": 1639 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4873, - "step": 1640 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6641, - "step": 1641 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3943, - "step": 1642 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2266, - "step": 1643 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0471, - "step": 1644 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5658, - "step": 1645 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6489, - "step": 1646 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3851, - "step": 1647 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7921, - "step": 1648 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4581, - "step": 1649 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1407, - "step": 1650 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2919, - "step": 1651 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4061, - "step": 1652 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3081, - "step": 1653 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0527, - "step": 1654 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8729, - "step": 1655 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.029, - "step": 1656 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6632, - "step": 1657 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7047, - "step": 1658 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6288, - "step": 1659 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8466, - "step": 1660 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7726, - "step": 1661 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.245, - "step": 1662 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0538, - "step": 1663 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3611, - "step": 1664 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.011, - "step": 1665 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6491, - "step": 1666 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3409, - "step": 1667 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.262, - "step": 1668 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.781, - "step": 1669 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.8025, - "step": 1670 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7529, - "step": 1671 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2322, - "step": 1672 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4527, - "step": 1673 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9457, - "step": 1674 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.859, - "step": 1675 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9207, - "step": 1676 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5378, - "step": 1677 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6585, - "step": 1678 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9523, - "step": 1679 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1348, - "step": 1680 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9582, - "step": 1681 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.416, - "step": 1682 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8214, - "step": 1683 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8833, - "step": 1684 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.1021, - "step": 1685 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7392, - "step": 1686 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2616, - "step": 1687 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.325, - "step": 1688 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3814, - "step": 1689 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2816, - "step": 1690 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.033, - "step": 1691 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.5742, - "step": 1692 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0841, - "step": 1693 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2888, - "step": 1694 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9263, - "step": 1695 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7552, - "step": 1696 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4163, - "step": 1697 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6207, - "step": 1698 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.938, - "step": 1699 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2925, - "step": 1700 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0401, - "step": 1701 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1536, - "step": 1702 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2754, - "step": 1703 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6765, - "step": 1704 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.63, - "step": 1705 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6902, - "step": 1706 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6434, - "step": 1707 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2283, - "step": 1708 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9377, - "step": 1709 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.371, - "step": 1710 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.6569, - "step": 1711 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.2221, - "step": 1712 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5375, - "step": 1713 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2189, - "step": 1714 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.769, - "step": 1715 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0089, - "step": 1716 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6402, - "step": 1717 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4812, - "step": 1718 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9754, - "step": 1719 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8435, - "step": 1720 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9424, - "step": 1721 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5465, - "step": 1722 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.477, - "step": 1723 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2254, - "step": 1724 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3663, - "step": 1725 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.663, - "step": 1726 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6951, - "step": 1727 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.856, - "step": 1728 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0652, - "step": 1729 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6929, - "step": 1730 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8069, - "step": 1731 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.02, - "step": 1732 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0782, - "step": 1733 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0236, - "step": 1734 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2769, - "step": 1735 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7126, - "step": 1736 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2746, - "step": 1737 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8726, - "step": 1738 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7962, - "step": 1739 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7602, - "step": 1740 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 10.3105, - "step": 1741 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0771, - "step": 1742 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4738, - "step": 1743 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2952, - "step": 1744 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.2692, - "step": 1745 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7201, - "step": 1746 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2978, - "step": 1747 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.518, - "step": 1748 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.659, - "step": 1749 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.9101, - "step": 1750 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8397, - "step": 1751 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0451, - "step": 1752 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7696, - "step": 1753 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1377, - "step": 1754 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.2621, - "step": 1755 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2143, - "step": 1756 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4378, - "step": 1757 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8631, - "step": 1758 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.019, - "step": 1759 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.7475, - "step": 1760 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6228, - "step": 1761 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0703, - "step": 1762 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3426, - "step": 1763 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.0842, - "step": 1764 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1032, - "step": 1765 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6321, - "step": 1766 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7804, - "step": 1767 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6566, - "step": 1768 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4985, - "step": 1769 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1129, - "step": 1770 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.8081, - "step": 1771 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8441, - "step": 1772 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4054, - "step": 1773 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6334, - "step": 1774 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.4323, - "step": 1775 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.18, - "step": 1776 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7573, - "step": 1777 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4642, - "step": 1778 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.038, - "step": 1779 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3785, - "step": 1780 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5088, - "step": 1781 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0139, - "step": 1782 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0999, - "step": 1783 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3224, - "step": 1784 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.634, - "step": 1785 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1264, - "step": 1786 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.317, - "step": 1787 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1279, - "step": 1788 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2364, - "step": 1789 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0627, - "step": 1790 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2471, - "step": 1791 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8407, - "step": 1792 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7083, - "step": 1793 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4522, - "step": 1794 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0308, - "step": 1795 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6915, - "step": 1796 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.082, - "step": 1797 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7956, - "step": 1798 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7007, - "step": 1799 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9197, - "step": 1800 - }, - { - "epoch": 0.01, - "eval_loss": 6.619495868682861, - "eval_runtime": 22.4352, - "eval_samples_per_second": 2.229, - "eval_steps_per_second": 1.114, - "step": 1800 - }, - { - "epoch": 0.01, - "mmlu_eval_accuracy": 0.3260281385281385, - "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.25, - "mmlu_eval_accuracy_business_ethics": 0.3333333333333333, - "mmlu_loss": 4.238778591156006, - "step": 1800 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1537, - "step": 1801 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.684, - "step": 1802 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7862, - "step": 1803 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3518, - "step": 1804 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.1795, - "step": 1805 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.0054, - "step": 1806 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8997, - "step": 1807 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9002, - "step": 1808 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2805, - "step": 1809 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1203, - "step": 1810 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0206, - "step": 1811 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0151, - "step": 1812 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3864, - "step": 1813 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1117, - "step": 1814 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8487, - "step": 1815 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.59, - "step": 1816 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.1615, - "step": 1817 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7362, - "step": 1818 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2294, - "step": 1819 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5622, - "step": 1820 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5437, - "step": 1821 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.093, - "step": 1822 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0343, - "step": 1823 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4454, - "step": 1824 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.5138, - "step": 1825 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5605, - "step": 1826 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.322, - "step": 1827 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6489, - "step": 1828 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.331, - "step": 1829 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6462, - "step": 1830 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.932, - "step": 1831 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.9058, - "step": 1832 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.3433, - "step": 1833 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4365, - "step": 1834 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.3282, - "step": 1835 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.448, - "step": 1836 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5369, - "step": 1837 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.177, - "step": 1838 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.3552, - "step": 1839 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.4568, - "step": 1840 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0602, - "step": 1841 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7449, - "step": 1842 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.2675, - "step": 1843 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.0317, - "step": 1844 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.4342, - "step": 1845 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8688, - "step": 1846 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3571, - "step": 1847 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.3776, - "step": 1848 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.2248, - "step": 1849 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6073, - "step": 1850 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8425, - "step": 1851 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5954, - "step": 1852 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4197, - "step": 1853 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8624, - "step": 1854 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9652, - "step": 1855 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7145, - "step": 1856 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5309, - "step": 1857 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.4356, - "step": 1858 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6508, - "step": 1859 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0955, - "step": 1860 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6886, - "step": 1861 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7644, - "step": 1862 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5709, - "step": 1863 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6212, - "step": 1864 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6325, - "step": 1865 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.6805, - "step": 1866 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1464, - "step": 1867 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9244, - "step": 1868 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.336, - "step": 1869 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.8783, - "step": 1870 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8236, - "step": 1871 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.084, - "step": 1872 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9639, - "step": 1873 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4173, - "step": 1874 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0042, - "step": 1875 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.2519, - "step": 1876 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4656, - "step": 1877 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5136, - "step": 1878 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.3918, - "step": 1879 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9696, - "step": 1880 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.9736, - "step": 1881 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6192, - "step": 1882 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3476, - "step": 1883 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3048, - "step": 1884 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1116, - "step": 1885 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.971, - "step": 1886 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.0741, - "step": 1887 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1418, - "step": 1888 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3487, - "step": 1889 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.38, - "step": 1890 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6561, - "step": 1891 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.5606, - "step": 1892 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.8623, - "step": 1893 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.2984, - "step": 1894 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6179, - "step": 1895 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8625, - "step": 1896 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.8596, - "step": 1897 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.7205, - "step": 1898 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.6727, - "step": 1899 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.016, - "step": 1900 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.9868, - "step": 1901 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.632, - "step": 1902 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.5133, - "step": 1903 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7476, - "step": 1904 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4174, - "step": 1905 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.6789, - "step": 1906 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4534, - "step": 1907 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3335, - "step": 1908 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.7921, - "step": 1909 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9567, - "step": 1910 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.1739, - "step": 1911 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7514, - "step": 1912 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3858, - "step": 1913 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0462, - "step": 1914 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3817, - "step": 1915 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 1.9739, - "step": 1916 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.1122, - "step": 1917 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.3361, - "step": 1918 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.3184, - "step": 1919 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.7342, - "step": 1920 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.375, - "step": 1921 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.6841, - "step": 1922 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0773, - "step": 1923 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.8916, - "step": 1924 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.7176, - "step": 1925 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8841, - "step": 1926 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.8345, - "step": 1927 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.561, - "step": 1928 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5392, - "step": 1929 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.1627, - "step": 1930 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.0657, - "step": 1931 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.7385, - "step": 1932 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.5533, - "step": 1933 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0925, - "step": 1934 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8752, - "step": 1935 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4039, - "step": 1936 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.6472, - "step": 1937 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.1819, - "step": 1938 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.5919, - "step": 1939 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.6527, - "step": 1940 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.5188, - "step": 1941 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.9856, - "step": 1942 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.7038, - "step": 1943 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.911, - "step": 1944 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.497, - "step": 1945 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.1804, - "step": 1946 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 2.3949, - "step": 1947 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 6.0433, - "step": 1948 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4706, - "step": 1949 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.5896, - "step": 1950 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.557, - "step": 1951 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.34, - "step": 1952 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.7865, - "step": 1953 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 8.0797, - "step": 1954 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2896, - "step": 1955 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.4096, - "step": 1956 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 3.9538, - "step": 1957 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.2778, - "step": 1958 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 7.4968, - "step": 1959 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.8328, - "step": 1960 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 9.4597, - "step": 1961 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 4.6776, - "step": 1962 - }, - { - "epoch": 0.01, - "learning_rate": 0.0004, - "loss": 5.4861, - "step": 1963 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5831, - "step": 1964 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4585, - "step": 1965 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7898, - "step": 1966 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8714, - "step": 1967 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.752, - "step": 1968 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9024, - "step": 1969 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.058, - "step": 1970 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1745, - "step": 1971 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2162, - "step": 1972 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2668, - "step": 1973 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3307, - "step": 1974 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3285, - "step": 1975 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1353, - "step": 1976 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8069, - "step": 1977 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6885, - "step": 1978 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5946, - "step": 1979 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6828, - "step": 1980 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6516, - "step": 1981 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.261, - "step": 1982 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.524, - "step": 1983 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.585, - "step": 1984 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8883, - "step": 1985 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.689, - "step": 1986 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1083, - "step": 1987 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1606, - "step": 1988 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9243, - "step": 1989 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6597, - "step": 1990 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2849, - "step": 1991 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3715, - "step": 1992 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7262, - "step": 1993 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6862, - "step": 1994 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5412, - "step": 1995 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7483, - "step": 1996 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3391, - "step": 1997 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2642, - "step": 1998 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1519, - "step": 1999 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7098, - "step": 2000 - }, - { - "epoch": 0.02, - "eval_loss": 6.762476921081543, - "eval_runtime": 22.4899, - "eval_samples_per_second": 2.223, - "eval_steps_per_second": 1.112, - "step": 2000 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.4606559085845947, - "step": 2000 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8099, - "step": 2001 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0567, - "step": 2002 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2981, - "step": 2003 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2668, - "step": 2004 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.139, - "step": 2005 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.903, - "step": 2006 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2182, - "step": 2007 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2347, - "step": 2008 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8383, - "step": 2009 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0211, - "step": 2010 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2572, - "step": 2011 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2877, - "step": 2012 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3577, - "step": 2013 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2022, - "step": 2014 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2722, - "step": 2015 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0552, - "step": 2016 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.9857, - "step": 2017 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0519, - "step": 2018 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7118, - "step": 2019 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4465, - "step": 2020 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3009, - "step": 2021 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3614, - "step": 2022 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3493, - "step": 2023 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.34, - "step": 2024 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0416, - "step": 2025 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.686, - "step": 2026 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6021, - "step": 2027 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4161, - "step": 2028 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.0029, - "step": 2029 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.8579, - "step": 2030 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0247, - "step": 2031 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4184, - "step": 2032 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4962, - "step": 2033 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5137, - "step": 2034 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.6692, - "step": 2035 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7161, - "step": 2036 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.617, - "step": 2037 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.413, - "step": 2038 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3056, - "step": 2039 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9441, - "step": 2040 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9943, - "step": 2041 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5703, - "step": 2042 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1881, - "step": 2043 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5763, - "step": 2044 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6389, - "step": 2045 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1717, - "step": 2046 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5482, - "step": 2047 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9469, - "step": 2048 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7685, - "step": 2049 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1381, - "step": 2050 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6961, - "step": 2051 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6425, - "step": 2052 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5354, - "step": 2053 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2404, - "step": 2054 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1556, - "step": 2055 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7133, - "step": 2056 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8166, - "step": 2057 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5071, - "step": 2058 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5429, - "step": 2059 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0367, - "step": 2060 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5386, - "step": 2061 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.5899, - "step": 2062 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2968, - "step": 2063 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9951, - "step": 2064 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8608, - "step": 2065 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4735, - "step": 2066 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5612, - "step": 2067 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7461, - "step": 2068 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5887, - "step": 2069 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3426, - "step": 2070 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5589, - "step": 2071 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.498, - "step": 2072 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1306, - "step": 2073 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.3492, - "step": 2074 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2311, - "step": 2075 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8798, - "step": 2076 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6799, - "step": 2077 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5011, - "step": 2078 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8892, - "step": 2079 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6449, - "step": 2080 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9117, - "step": 2081 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1157, - "step": 2082 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.196, - "step": 2083 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.9364, - "step": 2084 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3618, - "step": 2085 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3755, - "step": 2086 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4564, - "step": 2087 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4912, - "step": 2088 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.113, - "step": 2089 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0588, - "step": 2090 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.668, - "step": 2091 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.08, - "step": 2092 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2042, - "step": 2093 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4134, - "step": 2094 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0456, - "step": 2095 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2245, - "step": 2096 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4936, - "step": 2097 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5158, - "step": 2098 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7269, - "step": 2099 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7077, - "step": 2100 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6008, - "step": 2101 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4652, - "step": 2102 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.918, - "step": 2103 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.5819, - "step": 2104 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7764, - "step": 2105 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.0525, - "step": 2106 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5359, - "step": 2107 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4925, - "step": 2108 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4857, - "step": 2109 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.9445, - "step": 2110 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8494, - "step": 2111 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1513, - "step": 2112 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2552, - "step": 2113 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7229, - "step": 2114 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8571, - "step": 2115 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5968, - "step": 2116 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8806, - "step": 2117 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4641, - "step": 2118 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6039, - "step": 2119 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1379, - "step": 2120 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6688, - "step": 2121 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.293, - "step": 2122 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5664, - "step": 2123 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0825, - "step": 2124 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9788, - "step": 2125 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.9641, - "step": 2126 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7799, - "step": 2127 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0619, - "step": 2128 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0022, - "step": 2129 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8022, - "step": 2130 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5301, - "step": 2131 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.681, - "step": 2132 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7362, - "step": 2133 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5462, - "step": 2134 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2356, - "step": 2135 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2259, - "step": 2136 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3646, - "step": 2137 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8765, - "step": 2138 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6487, - "step": 2139 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9622, - "step": 2140 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1761, - "step": 2141 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6922, - "step": 2142 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0371, - "step": 2143 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7869, - "step": 2144 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3725, - "step": 2145 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8894, - "step": 2146 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6083, - "step": 2147 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4451, - "step": 2148 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1149, - "step": 2149 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8058, - "step": 2150 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1308, - "step": 2151 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1447, - "step": 2152 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.208, - "step": 2153 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5193, - "step": 2154 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7729, - "step": 2155 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5019, - "step": 2156 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6092, - "step": 2157 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1853, - "step": 2158 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7, - "step": 2159 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1638, - "step": 2160 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.762, - "step": 2161 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7455, - "step": 2162 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9372, - "step": 2163 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4569, - "step": 2164 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6705, - "step": 2165 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1988, - "step": 2166 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2526, - "step": 2167 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9066, - "step": 2168 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1365, - "step": 2169 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3422, - "step": 2170 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2691, - "step": 2171 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9008, - "step": 2172 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2555, - "step": 2173 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0886, - "step": 2174 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0369, - "step": 2175 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5566, - "step": 2176 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2567, - "step": 2177 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0179, - "step": 2178 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5383, - "step": 2179 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4797, - "step": 2180 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0163, - "step": 2181 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2658, - "step": 2182 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1337, - "step": 2183 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3287, - "step": 2184 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7874, - "step": 2185 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7153, - "step": 2186 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7037, - "step": 2187 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4412, - "step": 2188 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3912, - "step": 2189 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.034, - "step": 2190 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4697, - "step": 2191 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6243, - "step": 2192 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1133, - "step": 2193 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.9005, - "step": 2194 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7386, - "step": 2195 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4169, - "step": 2196 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8139, - "step": 2197 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3012, - "step": 2198 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8223, - "step": 2199 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3757, - "step": 2200 - }, - { - "epoch": 0.02, - "eval_loss": 6.580160140991211, - "eval_runtime": 22.4971, - "eval_samples_per_second": 2.223, - "eval_steps_per_second": 1.111, - "step": 2200 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.755114164352417, - "step": 2200 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5282, - "step": 2201 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2478, - "step": 2202 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.916, - "step": 2203 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5069, - "step": 2204 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5952, - "step": 2205 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5059, - "step": 2206 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7434, - "step": 2207 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.625, - "step": 2208 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1674, - "step": 2209 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3937, - "step": 2210 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8783, - "step": 2211 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5263, - "step": 2212 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7887, - "step": 2213 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8911, - "step": 2214 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7211, - "step": 2215 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.089, - "step": 2216 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6373, - "step": 2217 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7728, - "step": 2218 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6957, - "step": 2219 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.43, - "step": 2220 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9673, - "step": 2221 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8942, - "step": 2222 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2893, - "step": 2223 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1505, - "step": 2224 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3702, - "step": 2225 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1731, - "step": 2226 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.997, - "step": 2227 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9531, - "step": 2228 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0748, - "step": 2229 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0642, - "step": 2230 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9469, - "step": 2231 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2265, - "step": 2232 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6461, - "step": 2233 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.064, - "step": 2234 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1414, - "step": 2235 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5375, - "step": 2236 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6348, - "step": 2237 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9975, - "step": 2238 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5242, - "step": 2239 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3179, - "step": 2240 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6054, - "step": 2241 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1832, - "step": 2242 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.0572, - "step": 2243 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2049, - "step": 2244 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6348, - "step": 2245 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.67, - "step": 2246 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.5627, - "step": 2247 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1851, - "step": 2248 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6792, - "step": 2249 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6344, - "step": 2250 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7603, - "step": 2251 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7737, - "step": 2252 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5323, - "step": 2253 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4059, - "step": 2254 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9343, - "step": 2255 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0156, - "step": 2256 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1851, - "step": 2257 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.44, - "step": 2258 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9079, - "step": 2259 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4982, - "step": 2260 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 12.3777, - "step": 2261 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.1265, - "step": 2262 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1428, - "step": 2263 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8072, - "step": 2264 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.911, - "step": 2265 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9453, - "step": 2266 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0168, - "step": 2267 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2098, - "step": 2268 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4417, - "step": 2269 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8449, - "step": 2270 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.394, - "step": 2271 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7642, - "step": 2272 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5555, - "step": 2273 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3576, - "step": 2274 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.386, - "step": 2275 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6677, - "step": 2276 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2385, - "step": 2277 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8074, - "step": 2278 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2963, - "step": 2279 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3612, - "step": 2280 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1837, - "step": 2281 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5882, - "step": 2282 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0968, - "step": 2283 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2376, - "step": 2284 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3835, - "step": 2285 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0143, - "step": 2286 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.36, - "step": 2287 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0121, - "step": 2288 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0144, - "step": 2289 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6807, - "step": 2290 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8854, - "step": 2291 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1727, - "step": 2292 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.533, - "step": 2293 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9793, - "step": 2294 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.538, - "step": 2295 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.145, - "step": 2296 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.163, - "step": 2297 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1045, - "step": 2298 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0209, - "step": 2299 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9728, - "step": 2300 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8902, - "step": 2301 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3075, - "step": 2302 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.2194, - "step": 2303 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7375, - "step": 2304 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3863, - "step": 2305 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1317, - "step": 2306 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1878, - "step": 2307 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6124, - "step": 2308 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8843, - "step": 2309 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3988, - "step": 2310 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3523, - "step": 2311 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5766, - "step": 2312 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9096, - "step": 2313 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9315, - "step": 2314 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4044, - "step": 2315 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6206, - "step": 2316 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2429, - "step": 2317 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0383, - "step": 2318 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4282, - "step": 2319 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8973, - "step": 2320 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1771, - "step": 2321 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.624, - "step": 2322 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5197, - "step": 2323 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7313, - "step": 2324 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8234, - "step": 2325 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1702, - "step": 2326 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.536, - "step": 2327 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1904, - "step": 2328 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2077, - "step": 2329 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.891, - "step": 2330 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6784, - "step": 2331 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6611, - "step": 2332 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3402, - "step": 2333 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 11.1523, - "step": 2334 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5547, - "step": 2335 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3485, - "step": 2336 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8289, - "step": 2337 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2558, - "step": 2338 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1794, - "step": 2339 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8782, - "step": 2340 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.415, - "step": 2341 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5257, - "step": 2342 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4751, - "step": 2343 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2259, - "step": 2344 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8681, - "step": 2345 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6307, - "step": 2346 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1487, - "step": 2347 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.3949, - "step": 2348 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6988, - "step": 2349 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1299, - "step": 2350 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9938, - "step": 2351 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4176, - "step": 2352 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0184, - "step": 2353 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2779, - "step": 2354 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0162, - "step": 2355 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2335, - "step": 2356 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5505, - "step": 2357 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6445, - "step": 2358 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6513, - "step": 2359 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8503, - "step": 2360 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1817, - "step": 2361 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4376, - "step": 2362 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1351, - "step": 2363 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7566, - "step": 2364 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.626, - "step": 2365 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5818, - "step": 2366 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3033, - "step": 2367 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9289, - "step": 2368 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0301, - "step": 2369 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4713, - "step": 2370 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0931, - "step": 2371 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5812, - "step": 2372 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.2272, - "step": 2373 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5174, - "step": 2374 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1849, - "step": 2375 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7496, - "step": 2376 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.776, - "step": 2377 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3555, - "step": 2378 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.688, - "step": 2379 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0143, - "step": 2380 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7788, - "step": 2381 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7772, - "step": 2382 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6875, - "step": 2383 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9944, - "step": 2384 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8363, - "step": 2385 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7276, - "step": 2386 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4892, - "step": 2387 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1083, - "step": 2388 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.834, - "step": 2389 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8406, - "step": 2390 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1168, - "step": 2391 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2535, - "step": 2392 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9025, - "step": 2393 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4481, - "step": 2394 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7631, - "step": 2395 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2051, - "step": 2396 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7816, - "step": 2397 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2566, - "step": 2398 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1125, - "step": 2399 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5952, - "step": 2400 - }, - { - "epoch": 0.02, - "eval_loss": 6.616010665893555, - "eval_runtime": 22.4801, - "eval_samples_per_second": 2.224, - "eval_steps_per_second": 1.112, - "step": 2400 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.427501640319824, - "step": 2400 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6081, - "step": 2401 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2359, - "step": 2402 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2628, - "step": 2403 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8465, - "step": 2404 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.6746, - "step": 2405 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1066, - "step": 2406 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4981, - "step": 2407 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9194, - "step": 2408 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.239, - "step": 2409 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1749, - "step": 2410 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4597, - "step": 2411 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5164, - "step": 2412 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4122, - "step": 2413 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7398, - "step": 2414 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5009, - "step": 2415 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2757, - "step": 2416 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4202, - "step": 2417 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.108, - "step": 2418 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3303, - "step": 2419 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4671, - "step": 2420 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5768, - "step": 2421 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9172, - "step": 2422 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7606, - "step": 2423 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0745, - "step": 2424 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2907, - "step": 2425 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6956, - "step": 2426 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4414, - "step": 2427 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9823, - "step": 2428 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6486, - "step": 2429 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5173, - "step": 2430 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4647, - "step": 2431 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9405, - "step": 2432 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4226, - "step": 2433 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4334, - "step": 2434 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9136, - "step": 2435 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.6707, - "step": 2436 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6107, - "step": 2437 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5296, - "step": 2438 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0736, - "step": 2439 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4398, - "step": 2440 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5669, - "step": 2441 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.303, - "step": 2442 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2993, - "step": 2443 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9063, - "step": 2444 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3015, - "step": 2445 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3117, - "step": 2446 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6228, - "step": 2447 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6216, - "step": 2448 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6188, - "step": 2449 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8996, - "step": 2450 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5802, - "step": 2451 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2603, - "step": 2452 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0921, - "step": 2453 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9377, - "step": 2454 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0934, - "step": 2455 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9832, - "step": 2456 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1084, - "step": 2457 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2592, - "step": 2458 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8545, - "step": 2459 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4335, - "step": 2460 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5591, - "step": 2461 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.284, - "step": 2462 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8171, - "step": 2463 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8541, - "step": 2464 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1355, - "step": 2465 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6885, - "step": 2466 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.311, - "step": 2467 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.72, - "step": 2468 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.007, - "step": 2469 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2275, - "step": 2470 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.129, - "step": 2471 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9531, - "step": 2472 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7327, - "step": 2473 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5001, - "step": 2474 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9443, - "step": 2475 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6005, - "step": 2476 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5551, - "step": 2477 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3044, - "step": 2478 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6257, - "step": 2479 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5029, - "step": 2480 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3717, - "step": 2481 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5226, - "step": 2482 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2921, - "step": 2483 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7169, - "step": 2484 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2185, - "step": 2485 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5393, - "step": 2486 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0286, - "step": 2487 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3738, - "step": 2488 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2249, - "step": 2489 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7828, - "step": 2490 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.87, - "step": 2491 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.681, - "step": 2492 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5446, - "step": 2493 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0769, - "step": 2494 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3587, - "step": 2495 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9188, - "step": 2496 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9357, - "step": 2497 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3449, - "step": 2498 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2753, - "step": 2499 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3077, - "step": 2500 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0085, - "step": 2501 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5436, - "step": 2502 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9096, - "step": 2503 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7288, - "step": 2504 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7344, - "step": 2505 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6385, - "step": 2506 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6713, - "step": 2507 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6065, - "step": 2508 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3456, - "step": 2509 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1633, - "step": 2510 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5557, - "step": 2511 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7075, - "step": 2512 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4954, - "step": 2513 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5104, - "step": 2514 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5706, - "step": 2515 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7479, - "step": 2516 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7042, - "step": 2517 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9569, - "step": 2518 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7846, - "step": 2519 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.749, - "step": 2520 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5868, - "step": 2521 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3957, - "step": 2522 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2594, - "step": 2523 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.632, - "step": 2524 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.023, - "step": 2525 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0423, - "step": 2526 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1901, - "step": 2527 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0025, - "step": 2528 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0252, - "step": 2529 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8165, - "step": 2530 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6864, - "step": 2531 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.1174, - "step": 2532 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.46, - "step": 2533 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3714, - "step": 2534 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1206, - "step": 2535 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3734, - "step": 2536 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7229, - "step": 2537 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0124, - "step": 2538 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2105, - "step": 2539 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1127, - "step": 2540 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.1163, - "step": 2541 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5255, - "step": 2542 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2058, - "step": 2543 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7425, - "step": 2544 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3621, - "step": 2545 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7541, - "step": 2546 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9611, - "step": 2547 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3031, - "step": 2548 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1756, - "step": 2549 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6146, - "step": 2550 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1434, - "step": 2551 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0786, - "step": 2552 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9498, - "step": 2553 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8681, - "step": 2554 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5898, - "step": 2555 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7522, - "step": 2556 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3648, - "step": 2557 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8895, - "step": 2558 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9903, - "step": 2559 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1113, - "step": 2560 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6541, - "step": 2561 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8563, - "step": 2562 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.0685, - "step": 2563 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.59, - "step": 2564 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0927, - "step": 2565 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3792, - "step": 2566 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.208, - "step": 2567 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9275, - "step": 2568 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.25, - "step": 2569 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9524, - "step": 2570 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.556, - "step": 2571 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6414, - "step": 2572 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1749, - "step": 2573 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4529, - "step": 2574 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9764, - "step": 2575 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1835, - "step": 2576 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.438, - "step": 2577 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.834, - "step": 2578 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8879, - "step": 2579 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1302, - "step": 2580 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8966, - "step": 2581 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7319, - "step": 2582 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3287, - "step": 2583 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3322, - "step": 2584 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0278, - "step": 2585 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5355, - "step": 2586 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2016, - "step": 2587 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8335, - "step": 2588 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.721, - "step": 2589 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4628, - "step": 2590 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7896, - "step": 2591 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7226, - "step": 2592 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5834, - "step": 2593 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8944, - "step": 2594 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1519, - "step": 2595 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2173, - "step": 2596 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9751, - "step": 2597 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1408, - "step": 2598 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2469, - "step": 2599 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3136, - "step": 2600 - }, - { - "epoch": 0.02, - "eval_loss": 6.580307483673096, - "eval_runtime": 22.5866, - "eval_samples_per_second": 2.214, - "eval_steps_per_second": 1.107, - "step": 2600 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.17715097402597402, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.21428571428571427, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.0, - "mmlu_loss": 3.684196367263794, - "step": 2600 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4702, - "step": 2601 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2103, - "step": 2602 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1688, - "step": 2603 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0882, - "step": 2604 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4417, - "step": 2605 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4982, - "step": 2606 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.3721, - "step": 2607 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5558, - "step": 2608 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.251, - "step": 2609 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5517, - "step": 2610 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5841, - "step": 2611 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3849, - "step": 2612 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5556, - "step": 2613 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4158, - "step": 2614 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9386, - "step": 2615 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6376, - "step": 2616 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7976, - "step": 2617 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.291, - "step": 2618 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8779, - "step": 2619 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8159, - "step": 2620 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1198, - "step": 2621 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9783, - "step": 2622 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0635, - "step": 2623 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8539, - "step": 2624 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5904, - "step": 2625 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7561, - "step": 2626 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3628, - "step": 2627 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.2452, - "step": 2628 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8438, - "step": 2629 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7726, - "step": 2630 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.8356, - "step": 2631 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6763, - "step": 2632 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9104, - "step": 2633 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1199, - "step": 2634 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4833, - "step": 2635 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6164, - "step": 2636 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2284, - "step": 2637 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8304, - "step": 2638 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7706, - "step": 2639 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.638, - "step": 2640 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9576, - "step": 2641 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0828, - "step": 2642 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5206, - "step": 2643 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7744, - "step": 2644 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5361, - "step": 2645 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9041, - "step": 2646 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6145, - "step": 2647 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9121, - "step": 2648 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1322, - "step": 2649 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.1881, - "step": 2650 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6446, - "step": 2651 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9137, - "step": 2652 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4681, - "step": 2653 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9891, - "step": 2654 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3255, - "step": 2655 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.3909, - "step": 2656 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6869, - "step": 2657 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0611, - "step": 2658 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3314, - "step": 2659 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6415, - "step": 2660 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5881, - "step": 2661 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8889, - "step": 2662 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3336, - "step": 2663 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1282, - "step": 2664 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.158, - "step": 2665 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1773, - "step": 2666 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9617, - "step": 2667 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5576, - "step": 2668 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8284, - "step": 2669 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5936, - "step": 2670 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0931, - "step": 2671 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.07, - "step": 2672 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.778, - "step": 2673 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7786, - "step": 2674 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1279, - "step": 2675 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.463, - "step": 2676 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2192, - "step": 2677 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4147, - "step": 2678 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9957, - "step": 2679 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8919, - "step": 2680 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1558, - "step": 2681 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7069, - "step": 2682 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.487, - "step": 2683 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7098, - "step": 2684 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1032, - "step": 2685 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9937, - "step": 2686 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.0677, - "step": 2687 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.644, - "step": 2688 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5099, - "step": 2689 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6922, - "step": 2690 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7491, - "step": 2691 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.605, - "step": 2692 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1904, - "step": 2693 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9175, - "step": 2694 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3251, - "step": 2695 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.315, - "step": 2696 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3052, - "step": 2697 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2122, - "step": 2698 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9462, - "step": 2699 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3221, - "step": 2700 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3125, - "step": 2701 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.938, - "step": 2702 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0983, - "step": 2703 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8028, - "step": 2704 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4752, - "step": 2705 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.16, - "step": 2706 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2429, - "step": 2707 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.623, - "step": 2708 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9595, - "step": 2709 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5444, - "step": 2710 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.6245, - "step": 2711 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.919, - "step": 2712 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7332, - "step": 2713 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0067, - "step": 2714 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6957, - "step": 2715 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.994, - "step": 2716 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7562, - "step": 2717 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6882, - "step": 2718 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8732, - "step": 2719 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6496, - "step": 2720 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4841, - "step": 2721 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4927, - "step": 2722 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7437, - "step": 2723 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9469, - "step": 2724 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1481, - "step": 2725 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7762, - "step": 2726 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8806, - "step": 2727 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8352, - "step": 2728 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9488, - "step": 2729 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1679, - "step": 2730 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2412, - "step": 2731 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6839, - "step": 2732 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7269, - "step": 2733 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6139, - "step": 2734 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8754, - "step": 2735 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9007, - "step": 2736 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9134, - "step": 2737 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9469, - "step": 2738 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9293, - "step": 2739 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0489, - "step": 2740 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4197, - "step": 2741 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.3667, - "step": 2742 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8059, - "step": 2743 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.024, - "step": 2744 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0756, - "step": 2745 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0596, - "step": 2746 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1416, - "step": 2747 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1271, - "step": 2748 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1452, - "step": 2749 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.9527, - "step": 2750 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9189, - "step": 2751 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4364, - "step": 2752 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4173, - "step": 2753 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4034, - "step": 2754 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6678, - "step": 2755 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1092, - "step": 2756 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7751, - "step": 2757 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0164, - "step": 2758 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5796, - "step": 2759 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7851, - "step": 2760 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1784, - "step": 2761 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7417, - "step": 2762 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4268, - "step": 2763 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6919, - "step": 2764 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1838, - "step": 2765 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5592, - "step": 2766 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.425, - "step": 2767 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.848, - "step": 2768 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5683, - "step": 2769 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0262, - "step": 2770 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8063, - "step": 2771 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6139, - "step": 2772 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3478, - "step": 2773 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1811, - "step": 2774 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4519, - "step": 2775 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0071, - "step": 2776 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7872, - "step": 2777 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2263, - "step": 2778 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8923, - "step": 2779 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2997, - "step": 2780 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6857, - "step": 2781 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8874, - "step": 2782 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8203, - "step": 2783 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9583, - "step": 2784 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0814, - "step": 2785 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.419, - "step": 2786 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3063, - "step": 2787 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1285, - "step": 2788 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0039, - "step": 2789 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.671, - "step": 2790 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5452, - "step": 2791 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3885, - "step": 2792 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6282, - "step": 2793 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5913, - "step": 2794 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6189, - "step": 2795 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2968, - "step": 2796 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2825, - "step": 2797 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9897, - "step": 2798 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8193, - "step": 2799 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7042, - "step": 2800 - }, - { - "epoch": 0.02, - "eval_loss": 6.604581832885742, - "eval_runtime": 22.516, - "eval_samples_per_second": 2.221, - "eval_steps_per_second": 1.11, - "step": 2800 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 4.006761569976806, - "step": 2800 - }, - { - "epoch": 0.02, - "step": 2800, - "total_flos": 4.660001608148582e+16, - "train_loss": 6.312225336258395, - "train_runtime": 7855.0688, - "train_samples_per_second": 3.819, - "train_steps_per_second": 3.819 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0757, - "step": 2801 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8944, - "step": 2802 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8365, - "step": 2803 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.873, - "step": 2804 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3793, - "step": 2805 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1923, - "step": 2806 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2306, - "step": 2807 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4439, - "step": 2808 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3377, - "step": 2809 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8737, - "step": 2810 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4191, - "step": 2811 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.892, - "step": 2812 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4568, - "step": 2813 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0652, - "step": 2814 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6912, - "step": 2815 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9033, - "step": 2816 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4433, - "step": 2817 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7591, - "step": 2818 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4458, - "step": 2819 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3721, - "step": 2820 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4111, - "step": 2821 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0999, - "step": 2822 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5154, - "step": 2823 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1967, - "step": 2824 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8437, - "step": 2825 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.759, - "step": 2826 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6223, - "step": 2827 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3442, - "step": 2828 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1991, - "step": 2829 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5302, - "step": 2830 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1725, - "step": 2831 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8107, - "step": 2832 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7891, - "step": 2833 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5592, - "step": 2834 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8792, - "step": 2835 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2387, - "step": 2836 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9369, - "step": 2837 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2098, - "step": 2838 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6645, - "step": 2839 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2349, - "step": 2840 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8613, - "step": 2841 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5482, - "step": 2842 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5438, - "step": 2843 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6133, - "step": 2844 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9983, - "step": 2845 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8706, - "step": 2846 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9685, - "step": 2847 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.04, - "step": 2848 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6498, - "step": 2849 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6786, - "step": 2850 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.789, - "step": 2851 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 10.1116, - "step": 2852 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7085, - "step": 2853 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1083, - "step": 2854 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0795, - "step": 2855 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8677, - "step": 2856 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1663, - "step": 2857 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5379, - "step": 2858 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4923, - "step": 2859 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1456, - "step": 2860 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1686, - "step": 2861 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4153, - "step": 2862 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.17, - "step": 2863 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3301, - "step": 2864 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7106, - "step": 2865 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.463, - "step": 2866 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.614, - "step": 2867 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1451, - "step": 2868 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6817, - "step": 2869 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9577, - "step": 2870 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6171, - "step": 2871 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5129, - "step": 2872 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3386, - "step": 2873 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1176, - "step": 2874 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9075, - "step": 2875 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.667, - "step": 2876 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.8097, - "step": 2877 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7707, - "step": 2878 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7571, - "step": 2879 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0732, - "step": 2880 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5586, - "step": 2881 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8045, - "step": 2882 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4442, - "step": 2883 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.418, - "step": 2884 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7253, - "step": 2885 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4227, - "step": 2886 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9878, - "step": 2887 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8351, - "step": 2888 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1715, - "step": 2889 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1207, - "step": 2890 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0396, - "step": 2891 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7162, - "step": 2892 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2566, - "step": 2893 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4087, - "step": 2894 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4509, - "step": 2895 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8165, - "step": 2896 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9842, - "step": 2897 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.574, - "step": 2898 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4512, - "step": 2899 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9904, - "step": 2900 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6141, - "step": 2901 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9528, - "step": 2902 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.9028, - "step": 2903 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3868, - "step": 2904 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0461, - "step": 2905 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5161, - "step": 2906 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.107, - "step": 2907 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7174, - "step": 2908 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7671, - "step": 2909 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6803, - "step": 2910 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5357, - "step": 2911 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6898, - "step": 2912 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8564, - "step": 2913 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1457, - "step": 2914 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3408, - "step": 2915 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6095, - "step": 2916 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.866, - "step": 2917 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7324, - "step": 2918 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4026, - "step": 2919 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1467, - "step": 2920 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2496, - "step": 2921 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5265, - "step": 2922 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8246, - "step": 2923 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5775, - "step": 2924 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2886, - "step": 2925 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3076, - "step": 2926 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7975, - "step": 2927 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9898, - "step": 2928 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7256, - "step": 2929 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7971, - "step": 2930 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5884, - "step": 2931 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0749, - "step": 2932 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6183, - "step": 2933 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0019, - "step": 2934 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1706, - "step": 2935 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4075, - "step": 2936 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4092, - "step": 2937 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9424, - "step": 2938 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9025, - "step": 2939 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7751, - "step": 2940 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.988, - "step": 2941 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1208, - "step": 2942 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1999, - "step": 2943 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2129, - "step": 2944 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4431, - "step": 2945 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1979, - "step": 2946 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8246, - "step": 2947 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4876, - "step": 2948 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.7158, - "step": 2949 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3611, - "step": 2950 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.9919, - "step": 2951 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4136, - "step": 2952 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.801, - "step": 2953 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6503, - "step": 2954 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.553, - "step": 2955 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3536, - "step": 2956 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8516, - "step": 2957 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.9344, - "step": 2958 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8727, - "step": 2959 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9155, - "step": 2960 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9684, - "step": 2961 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0399, - "step": 2962 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.4298, - "step": 2963 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4559, - "step": 2964 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0361, - "step": 2965 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0081, - "step": 2966 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6698, - "step": 2967 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.3355, - "step": 2968 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.7555, - "step": 2969 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.29, - "step": 2970 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4079, - "step": 2971 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0588, - "step": 2972 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2956, - "step": 2973 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7318, - "step": 2974 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8846, - "step": 2975 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5159, - "step": 2976 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7629, - "step": 2977 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2039, - "step": 2978 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.311, - "step": 2979 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9786, - "step": 2980 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7138, - "step": 2981 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4827, - "step": 2982 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5264, - "step": 2983 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8153, - "step": 2984 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3512, - "step": 2985 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1515, - "step": 2986 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1689, - "step": 2987 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8027, - "step": 2988 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7472, - "step": 2989 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0236, - "step": 2990 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1741, - "step": 2991 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8884, - "step": 2992 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3912, - "step": 2993 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2109, - "step": 2994 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1693, - "step": 2995 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8166, - "step": 2996 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4902, - "step": 2997 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3921, - "step": 2998 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8787, - "step": 2999 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1304, - "step": 3000 - }, - { - "epoch": 0.02, - "eval_loss": 6.659167289733887, - "eval_runtime": 22.4512, - "eval_samples_per_second": 2.227, - "eval_steps_per_second": 1.114, - "step": 3000 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.973116703033447, - "step": 3000 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4366, - "step": 3001 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1239, - "step": 3002 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.419, - "step": 3003 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7662, - "step": 3004 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1246, - "step": 3005 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3498, - "step": 3006 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1245, - "step": 3007 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6488, - "step": 3008 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3641, - "step": 3009 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7842, - "step": 3010 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.802, - "step": 3011 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1015, - "step": 3012 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9015, - "step": 3013 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8768, - "step": 3014 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7296, - "step": 3015 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4175, - "step": 3016 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3568, - "step": 3017 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5869, - "step": 3018 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5304, - "step": 3019 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1129, - "step": 3020 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8128, - "step": 3021 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1541, - "step": 3022 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3002, - "step": 3023 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0616, - "step": 3024 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3152, - "step": 3025 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4433, - "step": 3026 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8406, - "step": 3027 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2653, - "step": 3028 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7041, - "step": 3029 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3463, - "step": 3030 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7164, - "step": 3031 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9969, - "step": 3032 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1306, - "step": 3033 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0204, - "step": 3034 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6743, - "step": 3035 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3186, - "step": 3036 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5673, - "step": 3037 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1416, - "step": 3038 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1956, - "step": 3039 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6836, - "step": 3040 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0172, - "step": 3041 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.796, - "step": 3042 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6945, - "step": 3043 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.5079, - "step": 3044 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.399, - "step": 3045 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0071, - "step": 3046 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.4171, - "step": 3047 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0925, - "step": 3048 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.6842, - "step": 3049 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2442, - "step": 3050 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8023, - "step": 3051 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7147, - "step": 3052 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9681, - "step": 3053 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1882, - "step": 3054 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9869, - "step": 3055 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0705, - "step": 3056 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8802, - "step": 3057 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8399, - "step": 3058 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6438, - "step": 3059 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0608, - "step": 3060 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.763, - "step": 3061 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.355, - "step": 3062 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5504, - "step": 3063 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1582, - "step": 3064 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1922, - "step": 3065 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0706, - "step": 3066 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.807, - "step": 3067 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0305, - "step": 3068 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0774, - "step": 3069 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4469, - "step": 3070 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1158, - "step": 3071 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8087, - "step": 3072 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5672, - "step": 3073 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5864, - "step": 3074 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7665, - "step": 3075 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2434, - "step": 3076 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3988, - "step": 3077 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0307, - "step": 3078 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6561, - "step": 3079 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8158, - "step": 3080 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8384, - "step": 3081 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5515, - "step": 3082 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8108, - "step": 3083 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2536, - "step": 3084 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2506, - "step": 3085 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1605, - "step": 3086 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4572, - "step": 3087 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.3312, - "step": 3088 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1481, - "step": 3089 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.3304, - "step": 3090 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2617, - "step": 3091 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3148, - "step": 3092 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4781, - "step": 3093 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.327, - "step": 3094 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3288, - "step": 3095 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2354, - "step": 3096 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4802, - "step": 3097 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1659, - "step": 3098 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.9887, - "step": 3099 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 1.9497, - "step": 3100 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2668, - "step": 3101 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.458, - "step": 3102 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.9919, - "step": 3103 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0408, - "step": 3104 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9966, - "step": 3105 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.1371, - "step": 3106 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0939, - "step": 3107 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2532, - "step": 3108 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7343, - "step": 3109 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.156, - "step": 3110 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2223, - "step": 3111 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6182, - "step": 3112 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4423, - "step": 3113 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3855, - "step": 3114 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.2115, - "step": 3115 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6719, - "step": 3116 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5285, - "step": 3117 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0171, - "step": 3118 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2127, - "step": 3119 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8153, - "step": 3120 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1838, - "step": 3121 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.711, - "step": 3122 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1568, - "step": 3123 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3741, - "step": 3124 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2741, - "step": 3125 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.1653, - "step": 3126 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.9722, - "step": 3127 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9181, - "step": 3128 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.384, - "step": 3129 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.1491, - "step": 3130 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8641, - "step": 3131 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6125, - "step": 3132 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1702, - "step": 3133 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.4853, - "step": 3134 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7929, - "step": 3135 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8747, - "step": 3136 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.2659, - "step": 3137 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.0685, - "step": 3138 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2297, - "step": 3139 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0715, - "step": 3140 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2201, - "step": 3141 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2134, - "step": 3142 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6098, - "step": 3143 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.2036, - "step": 3144 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2481, - "step": 3145 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4508, - "step": 3146 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.1454, - "step": 3147 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7573, - "step": 3148 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2946, - "step": 3149 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0121, - "step": 3150 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.498, - "step": 3151 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4016, - "step": 3152 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5934, - "step": 3153 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.737, - "step": 3154 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9874, - "step": 3155 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7714, - "step": 3156 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3529, - "step": 3157 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7673, - "step": 3158 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3835, - "step": 3159 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.0336, - "step": 3160 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2998, - "step": 3161 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0446, - "step": 3162 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5225, - "step": 3163 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1391, - "step": 3164 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7669, - "step": 3165 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.118, - "step": 3166 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7045, - "step": 3167 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.077, - "step": 3168 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0697, - "step": 3169 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8445, - "step": 3170 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4044, - "step": 3171 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9817, - "step": 3172 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.2508, - "step": 3173 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6302, - "step": 3174 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.46, - "step": 3175 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.8094, - "step": 3176 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1174, - "step": 3177 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5756, - "step": 3178 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4215, - "step": 3179 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.5957, - "step": 3180 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3534, - "step": 3181 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9342, - "step": 3182 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.8227, - "step": 3183 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.1404, - "step": 3184 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9897, - "step": 3185 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7286, - "step": 3186 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.5414, - "step": 3187 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7523, - "step": 3188 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4033, - "step": 3189 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8593, - "step": 3190 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6095, - "step": 3191 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.504, - "step": 3192 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.6776, - "step": 3193 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0174, - "step": 3194 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7759, - "step": 3195 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5685, - "step": 3196 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2786, - "step": 3197 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.7794, - "step": 3198 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5827, - "step": 3199 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3282, - "step": 3200 - }, - { - "epoch": 0.02, - "eval_loss": 6.423073768615723, - "eval_runtime": 22.4644, - "eval_samples_per_second": 2.226, - "eval_steps_per_second": 1.113, - "step": 3200 - }, - { - "epoch": 0.02, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.9956862831115725, - "step": 3200 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3005, - "step": 3201 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0131, - "step": 3202 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.9222, - "step": 3203 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6755, - "step": 3204 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.8386, - "step": 3205 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7854, - "step": 3206 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4073, - "step": 3207 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.4278, - "step": 3208 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5322, - "step": 3209 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.1508, - "step": 3210 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4289, - "step": 3211 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 9.0088, - "step": 3212 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6227, - "step": 3213 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.232, - "step": 3214 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4584, - "step": 3215 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8538, - "step": 3216 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.1044, - "step": 3217 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3851, - "step": 3218 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7661, - "step": 3219 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.4478, - "step": 3220 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0658, - "step": 3221 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.0961, - "step": 3222 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.5636, - "step": 3223 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6945, - "step": 3224 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.5381, - "step": 3225 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2431, - "step": 3226 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3755, - "step": 3227 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.4123, - "step": 3228 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.0414, - "step": 3229 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6732, - "step": 3230 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8904, - "step": 3231 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7829, - "step": 3232 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8394, - "step": 3233 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.2565, - "step": 3234 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.7535, - "step": 3235 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.8878, - "step": 3236 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9086, - "step": 3237 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7491, - "step": 3238 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.0775, - "step": 3239 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.3575, - "step": 3240 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.7583, - "step": 3241 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.7107, - "step": 3242 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.3212, - "step": 3243 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.5662, - "step": 3244 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.9553, - "step": 3245 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.3713, - "step": 3246 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4519, - "step": 3247 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.0003, - "step": 3248 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.2796, - "step": 3249 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.3262, - "step": 3250 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.0106, - "step": 3251 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.8502, - "step": 3252 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.3491, - "step": 3253 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 2.2728, - "step": 3254 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.822, - "step": 3255 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.6077, - "step": 3256 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9328, - "step": 3257 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.8518, - "step": 3258 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.6541, - "step": 3259 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.6976, - "step": 3260 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 8.9285, - "step": 3261 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.9624, - "step": 3262 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6754, - "step": 3263 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.4891, - "step": 3264 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.7557, - "step": 3265 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9059, - "step": 3266 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 6.2432, - "step": 3267 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 5.9467, - "step": 3268 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 7.6248, - "step": 3269 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 3.6632, - "step": 3270 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.5671, - "step": 3271 - }, - { - "epoch": 0.02, - "learning_rate": 0.0004, - "loss": 4.9754, - "step": 3272 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 8.2562, - "step": 3273 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 7.6304, - "step": 3274 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.2176, - "step": 3275 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.0867, - "step": 3276 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 5.6769, - "step": 3277 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 4.1474, - "step": 3278 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.5615, - "step": 3279 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 5.3285, - "step": 3280 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.2536, - "step": 3281 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 2.1851, - "step": 3282 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 7.1866, - "step": 3283 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 7.26, - "step": 3284 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.1826, - "step": 3285 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 5.3643, - "step": 3286 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.5651, - "step": 3287 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 7.7203, - "step": 3288 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.7083, - "step": 3289 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.1364, - "step": 3290 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.1367, - "step": 3291 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 8.5968, - "step": 3292 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.287, - "step": 3293 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.5756, - "step": 3294 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.3841, - "step": 3295 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.3323, - "step": 3296 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 4.0486, - "step": 3297 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 7.6838, - "step": 3298 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.5777, - "step": 3299 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 4.4593, - "step": 3300 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.9308, - "step": 3301 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 2.6334, - "step": 3302 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 7.6559, - "step": 3303 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 2.7031, - "step": 3304 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 9.2146, - "step": 3305 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 2.6907, - "step": 3306 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 2.8058, - "step": 3307 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 4.7805, - "step": 3308 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 7.557, - "step": 3309 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 8.2331, - "step": 3310 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.1702, - "step": 3311 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.2958, - "step": 3312 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.6121, - "step": 3313 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 5.9686, - "step": 3314 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 8.0866, - "step": 3315 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.2124, - "step": 3316 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.9969, - "step": 3317 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 8.3417, - "step": 3318 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 4.7311, - "step": 3319 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 5.8008, - "step": 3320 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 4.2469, - "step": 3321 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 8.4888, - "step": 3322 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.2527, - "step": 3323 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.7326, - "step": 3324 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.4268, - "step": 3325 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 5.3635, - "step": 3326 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.8848, - "step": 3327 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 5.2306, - "step": 3328 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 7.4185, - "step": 3329 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.0911, - "step": 3330 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 8.0277, - "step": 3331 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 7.4066, - "step": 3332 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 4.9085, - "step": 3333 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.2858, - "step": 3334 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.4789, - "step": 3335 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 5.8016, - "step": 3336 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 2.9443, - "step": 3337 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 5.8456, - "step": 3338 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 4.0709, - "step": 3339 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 5.7879, - "step": 3340 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 4.6286, - "step": 3341 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 8.7528, - "step": 3342 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.4875, - "step": 3343 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 7.4383, - "step": 3344 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 5.7478, - "step": 3345 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 5.8581, - "step": 3346 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.4681, - "step": 3347 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 7.179, - "step": 3348 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 2.772, - "step": 3349 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 4.5266, - "step": 3350 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.962, - "step": 3351 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.1986, - "step": 3352 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.8946, - "step": 3353 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 7.8563, - "step": 3354 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 8.0887, - "step": 3355 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 4.9487, - "step": 3356 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.9164, - "step": 3357 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 5.6243, - "step": 3358 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 8.1646, - "step": 3359 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 7.3753, - "step": 3360 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.6102, - "step": 3361 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 7.8946, - "step": 3362 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 8.1349, - "step": 3363 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 5.7417, - "step": 3364 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 5.4195, - "step": 3365 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 5.0547, - "step": 3366 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 7.3341, - "step": 3367 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 7.4345, - "step": 3368 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 9.3088, - "step": 3369 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 5.7109, - "step": 3370 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.6865, - "step": 3371 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.5466, - "step": 3372 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 4.4801, - "step": 3373 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.7442, - "step": 3374 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 7.0129, - "step": 3375 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 7.2859, - "step": 3376 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 4.0064, - "step": 3377 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 5.1277, - "step": 3378 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.9071, - "step": 3379 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.3575, - "step": 3380 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 4.7016, - "step": 3381 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 8.151, - "step": 3382 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 8.4008, - "step": 3383 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.3565, - "step": 3384 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.3863, - "step": 3385 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.7621, - "step": 3386 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.3799, - "step": 3387 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.8051, - "step": 3388 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.1864, - "step": 3389 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 7.4464, - "step": 3390 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 2.649, - "step": 3391 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.7819, - "step": 3392 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 6.4867, - "step": 3393 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 5.0732, - "step": 3394 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.2601, - "step": 3395 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 4.8808, - "step": 3396 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 2.9649, - "step": 3397 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 7.3081, - "step": 3398 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 3.7762, - "step": 3399 - }, - { - "epoch": 0.03, - "learning_rate": 0.0004, - "loss": 5.0978, - "step": 3400 - }, - { - "epoch": 0.03, - "eval_loss": 6.543734073638916, - "eval_runtime": 22.8771, - "eval_samples_per_second": 2.186, - "eval_steps_per_second": 1.093, - "step": 3400 - }, - { - "epoch": 0.03, - "mmlu_eval_accuracy": 0.2525477994227994, - "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, - "mmlu_eval_accuracy_anatomy": 0.07142857142857142, - "mmlu_eval_accuracy_astronomy": 0.3125, - "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, - "mmlu_loss": 3.872052774429321, - "step": 3400 - } - ], - "max_steps": 30000, - "num_train_epochs": 1, - "total_flos": 5.56834317728809e+16, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoint-3400/training_args.bin b/checkpoint-3400/training_args.bin deleted file mode 100644 index 53a16291359ea01b885cc36189679e385fee54a8..0000000000000000000000000000000000000000 --- a/checkpoint-3400/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f2f399ab69470e06aaa321f2990a85c1505da75b9e960c095081ae355addfd1d -size 6011 diff --git a/checkpoint-3600/README.md b/checkpoint-3600/README.md deleted file mode 100644 index 82793f73e61dbb024e11fc6697bba1622d4d0db6..0000000000000000000000000000000000000000 --- a/checkpoint-3600/README.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -library_name: peft ---- -## Training procedure - - -The following `bitsandbytes` quantization config was used during training: -- load_in_8bit: False -- load_in_4bit: True -- llm_int8_threshold: 6.0 -- llm_int8_skip_modules: None -- llm_int8_enable_fp32_cpu_offload: False -- llm_int8_has_fp16_weight: False -- bnb_4bit_quant_type: nf4 -- bnb_4bit_use_double_quant: True -- bnb_4bit_compute_dtype: bfloat16 -### Framework versions - - -- PEFT 0.4.0 diff --git a/checkpoint-3600/adapter_config.json b/checkpoint-3600/adapter_config.json deleted file mode 100644 index 2adcd7d22e9c842efe5230fdbfc7ae7a84aededb..0000000000000000000000000000000000000000 --- a/checkpoint-3600/adapter_config.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "auto_mapping": null, - "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": true, - "layers_pattern": null, - "layers_to_transform": null, - "lora_alpha": 16.0, - "lora_dropout": 0.1, - "modules_to_save": null, - "peft_type": "LORA", - "r": 64, - "revision": null, - "target_modules": [ - "q_proj", - "o_proj", - "k_proj", - "gate_proj", - "down_proj", - "v_proj", - "up_proj" - ], - "task_type": "CAUSAL_LM" -} \ No newline at end of file diff --git a/checkpoint-3600/adapter_model.bin b/checkpoint-3600/adapter_model.bin deleted file mode 100644 index 14b9867f2bf987d46258aae84b69552f6eda3b9d..0000000000000000000000000000000000000000 --- a/checkpoint-3600/adapter_model.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b34e3b3576f3720f7185c1ccb0d2d9c0804e57ec84709194b281ba48ab04b1e2 -size 871609293 diff --git a/checkpoint-3600/added_tokens.json b/checkpoint-3600/added_tokens.json deleted file mode 100644 index e41416ddd79948246ea2dced6800ea3cd531c424..0000000000000000000000000000000000000000 --- a/checkpoint-3600/added_tokens.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "[PAD]": 32000 -} diff --git a/checkpoint-3600/optimizer.pt b/checkpoint-3600/optimizer.pt deleted file mode 100644 index 2362b4ae0a141013419384f7cc6e35b32cfecc97..0000000000000000000000000000000000000000 --- a/checkpoint-3600/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:98b87c063236b5fc1691619bd78ec6295b670c63cff17671beeb11eeba7e8c2a -size 873873439 diff --git a/checkpoint-3600/rng_state.pth b/checkpoint-3600/rng_state.pth deleted file mode 100644 index 635916c5cebf8cb5739310eb4296325354c55755..0000000000000000000000000000000000000000 --- a/checkpoint-3600/rng_state.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2e40a0810d44b7cb7127ecc61949ff92fd7be131f4d5dee65efe42c196a85e9c -size 14511 diff --git a/checkpoint-3600/scheduler.pt b/checkpoint-3600/scheduler.pt deleted file mode 100644 index 3928d63d5c99298229914f0696159f4c207e9e91..0000000000000000000000000000000000000000 --- a/checkpoint-3600/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f65bc6eebca3baf3a9e642997ebf957326052a11f3e513f00b005285909a6c9b -size 627 diff --git a/checkpoint-3600/special_tokens_map.json b/checkpoint-3600/special_tokens_map.json deleted file mode 100644 index 3f58a5e115855c6ea3cec98accae196ad927222e..0000000000000000000000000000000000000000 --- a/checkpoint-3600/special_tokens_map.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "bos_token": "", - "eos_token": "", - "pad_token": "[PAD]", - "unk_token": "" -} diff --git a/checkpoint-3600/tokenizer.model b/checkpoint-3600/tokenizer.model deleted file mode 100644 index 6c00c742ce03c627d6cd5b795984876fa49fa899..0000000000000000000000000000000000000000 --- a/checkpoint-3600/tokenizer.model +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 -size 499723 diff --git a/checkpoint-3600/tokenizer_config.json b/checkpoint-3600/tokenizer_config.json deleted file mode 100644 index daaef2433dab9469de98b5b9a3848221ab25b7e8..0000000000000000000000000000000000000000 --- a/checkpoint-3600/tokenizer_config.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "bos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - }, - "clean_up_tokenization_spaces": false, - "eos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - }, - "legacy": null, - "model_max_length": 1000000000000000019884624838656, - "pad_token": null, - "padding_side": "right", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizer", - "unk_token": { - "__type": "AddedToken", - "content": "", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false - } -} diff --git a/checkpoint-3600/training_args.bin b/checkpoint-3600/training_args.bin deleted file mode 100644 index 53a16291359ea01b885cc36189679e385fee54a8..0000000000000000000000000000000000000000 --- a/checkpoint-3600/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f2f399ab69470e06aaa321f2990a85c1505da75b9e960c095081ae355addfd1d -size 6011 diff --git a/checkpoint-4200/adapter_model/adapter_model/README.md b/checkpoint-4200/adapter_model/adapter_model/README.md index b2a9ac08c477a18d16ef75ee89b21cee91a6169a..695fd1bbc4fbae30f9d5e97a03a533a5ce88ee48 100644 --- a/checkpoint-4200/adapter_model/adapter_model/README.md +++ b/checkpoint-4200/adapter_model/adapter_model/README.md @@ -26,6 +26,28 @@ The following `bitsandbytes` quantization config was used during training: - bnb_4bit_use_double_quant: True - bnb_4bit_compute_dtype: bfloat16 +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: nf4 +- bnb_4bit_use_double_quant: True +- bnb_4bit_compute_dtype: bfloat16 + +The following `bitsandbytes` quantization config was used during training: +- load_in_8bit: False +- load_in_4bit: True +- llm_int8_threshold: 6.0 +- llm_int8_skip_modules: None +- llm_int8_enable_fp32_cpu_offload: False +- llm_int8_has_fp16_weight: False +- bnb_4bit_quant_type: nf4 +- bnb_4bit_use_double_quant: True +- bnb_4bit_compute_dtype: bfloat16 + The following `bitsandbytes` quantization config was used during training: - load_in_8bit: False - load_in_4bit: True @@ -38,6 +60,8 @@ The following `bitsandbytes` quantization config was used during training: - bnb_4bit_compute_dtype: bfloat16 ### Framework versions +- PEFT 0.4.0 +- PEFT 0.4.0 - PEFT 0.4.0 - PEFT 0.4.0 diff --git a/checkpoint-4200/adapter_model/adapter_model/adapter_model.bin b/checkpoint-4200/adapter_model/adapter_model/adapter_model.bin index 75403d8c4e929d311e7bce5cf774d2098a629ddb..fc016efdf9506d0259f547141bf761264d0ab211 100644 --- a/checkpoint-4200/adapter_model/adapter_model/adapter_model.bin +++ b/checkpoint-4200/adapter_model/adapter_model/adapter_model.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3000da4279d12ca434e479144b3d896ae113abf8f38241e4c9e34173acb30214 +oid sha256:ff18c40f9b3c9fb20f1c95d4dff151244eba09eee79ae11c6121cc23181c2442 size 871609293 diff --git a/checkpoint-2000/README.md b/checkpoint-5000/README.md similarity index 100% rename from checkpoint-2000/README.md rename to checkpoint-5000/README.md diff --git a/checkpoint-2000/adapter_config.json b/checkpoint-5000/adapter_config.json similarity index 100% rename from checkpoint-2000/adapter_config.json rename to checkpoint-5000/adapter_config.json index a2f0ea437da66b2120cc72d92fb46f999dfb8535..092fee3278e3444f43369802f65b72c8a1e4b2b3 100644 --- a/checkpoint-2000/adapter_config.json +++ b/checkpoint-5000/adapter_config.json @@ -14,12 +14,12 @@ "r": 64, "revision": null, "target_modules": [ + "v_proj", "down_proj", - "up_proj", "q_proj", "gate_proj", "o_proj", - "v_proj", + "up_proj", "k_proj" ], "task_type": "CAUSAL_LM" diff --git a/checkpoint-5000/adapter_model.bin b/checkpoint-5000/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..d84c7171eb208fd34875365c634b8187ad2be92d --- /dev/null +++ b/checkpoint-5000/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c7b7177ebf043527303efc179b203f643a96dcbc5d00d10d809a5b270f2b361 +size 871609293 diff --git a/checkpoint-2000/added_tokens.json b/checkpoint-5000/added_tokens.json similarity index 100% rename from checkpoint-2000/added_tokens.json rename to checkpoint-5000/added_tokens.json diff --git a/checkpoint-5000/optimizer.pt b/checkpoint-5000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..38b20b768c78082b15b18c44aacd69e1ca27b6e7 --- /dev/null +++ b/checkpoint-5000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ebc554610b11caf5633c717bfa7316e18989ef6c7946946d6160d9616a1d2fa +size 873872799 diff --git a/checkpoint-5000/rng_state.pth b/checkpoint-5000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..47651251f956820d3d6b57187c6b807f1f264c3b --- /dev/null +++ b/checkpoint-5000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4b1d4515ff05e70ac8f64f10952b83cf5808631c69a1c78b39c9aa91ccdb123 +size 14511 diff --git a/checkpoint-3000/scheduler.pt b/checkpoint-5000/scheduler.pt similarity index 100% rename from checkpoint-3000/scheduler.pt rename to checkpoint-5000/scheduler.pt diff --git a/checkpoint-2000/special_tokens_map.json b/checkpoint-5000/special_tokens_map.json similarity index 100% rename from checkpoint-2000/special_tokens_map.json rename to checkpoint-5000/special_tokens_map.json diff --git a/checkpoint-2000/tokenizer.model b/checkpoint-5000/tokenizer.model similarity index 100% rename from checkpoint-2000/tokenizer.model rename to checkpoint-5000/tokenizer.model diff --git a/checkpoint-2000/tokenizer_config.json b/checkpoint-5000/tokenizer_config.json similarity index 100% rename from checkpoint-2000/tokenizer_config.json rename to checkpoint-5000/tokenizer_config.json diff --git a/checkpoint-3600/trainer_state.json b/checkpoint-5000/trainer_state.json similarity index 71% rename from checkpoint-3600/trainer_state.json rename to checkpoint-5000/trainer_state.json index 02aff637daf9116fa4e529a8e2a4b3be39eccc0b..cae76cf9d9e9e6e6bf2d5ba5a5109a8110559bba 100644 --- a/checkpoint-3600/trainer_state.json +++ b/checkpoint-5000/trainer_state.json @@ -1,8 +1,8 @@ { - "best_metric": 6.423073768615723, - "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-3200", - "epoch": 0.027499809029103966, - "global_step": 3600, + "best_metric": 6.335043907165527, + "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-4200", + "epoch": 0.03819417920708884, + "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -21939,11 +21939,8555 @@ "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, "mmlu_loss": 4.059268207550049, "step": 3600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.79, + "step": 3601 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9946, + "step": 3602 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5461, + "step": 3603 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5258, + "step": 3604 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7346, + "step": 3605 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3883, + "step": 3606 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.5788, + "step": 3607 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2204, + "step": 3608 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1632, + "step": 3609 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.7381, + "step": 3610 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1473, + "step": 3611 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.9152, + "step": 3612 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.4622, + "step": 3613 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6216, + "step": 3614 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6317, + "step": 3615 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4409, + "step": 3616 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0236, + "step": 3617 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8894, + "step": 3618 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0822, + "step": 3619 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.953, + "step": 3620 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1938, + "step": 3621 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3474, + "step": 3622 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4535, + "step": 3623 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3665, + "step": 3624 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.4139, + "step": 3625 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6508, + "step": 3626 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5838, + "step": 3627 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9303, + "step": 3628 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3722, + "step": 3629 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.4449, + "step": 3630 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3978, + "step": 3631 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5792, + "step": 3632 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4508, + "step": 3633 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5883, + "step": 3634 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1239, + "step": 3635 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.2981, + "step": 3636 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7042, + "step": 3637 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1361, + "step": 3638 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.9068, + "step": 3639 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3976, + "step": 3640 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0685, + "step": 3641 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5028, + "step": 3642 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0945, + "step": 3643 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.0476, + "step": 3644 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7249, + "step": 3645 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.8524, + "step": 3646 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1725, + "step": 3647 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1051, + "step": 3648 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7044, + "step": 3649 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0646, + "step": 3650 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.5899, + "step": 3651 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2098, + "step": 3652 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2737, + "step": 3653 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7012, + "step": 3654 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.392, + "step": 3655 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1024, + "step": 3656 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8084, + "step": 3657 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0021, + "step": 3658 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.0005, + "step": 3659 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.0075, + "step": 3660 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0208, + "step": 3661 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1425, + "step": 3662 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2731, + "step": 3663 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0704, + "step": 3664 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.7608, + "step": 3665 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.126, + "step": 3666 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1159, + "step": 3667 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.301, + "step": 3668 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7234, + "step": 3669 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.9938, + "step": 3670 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.1246, + "step": 3671 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2179, + "step": 3672 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4044, + "step": 3673 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0196, + "step": 3674 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.5111, + "step": 3675 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6913, + "step": 3676 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.3374, + "step": 3677 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.7363, + "step": 3678 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.0937, + "step": 3679 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.691, + "step": 3680 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1534, + "step": 3681 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2765, + "step": 3682 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.635, + "step": 3683 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.4933, + "step": 3684 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3922, + "step": 3685 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2925, + "step": 3686 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.646, + "step": 3687 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2651, + "step": 3688 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7927, + "step": 3689 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.7202, + "step": 3690 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5883, + "step": 3691 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.1781, + "step": 3692 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5795, + "step": 3693 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1326, + "step": 3694 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.2378, + "step": 3695 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.462, + "step": 3696 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7268, + "step": 3697 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.9041, + "step": 3698 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8232, + "step": 3699 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 1.7176, + "step": 3700 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.854, + "step": 3701 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 1.6464, + "step": 3702 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.264, + "step": 3703 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.8032, + "step": 3704 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.7097, + "step": 3705 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.068, + "step": 3706 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3674, + "step": 3707 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5067, + "step": 3708 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.4841, + "step": 3709 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2839, + "step": 3710 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.0894, + "step": 3711 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.5614, + "step": 3712 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7785, + "step": 3713 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8865, + "step": 3714 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0697, + "step": 3715 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0635, + "step": 3716 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0788, + "step": 3717 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8854, + "step": 3718 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.3054, + "step": 3719 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7088, + "step": 3720 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8188, + "step": 3721 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.4379, + "step": 3722 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8049, + "step": 3723 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4344, + "step": 3724 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.647, + "step": 3725 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5758, + "step": 3726 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.9208, + "step": 3727 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4727, + "step": 3728 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2176, + "step": 3729 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0123, + "step": 3730 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.404, + "step": 3731 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0537, + "step": 3732 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8559, + "step": 3733 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2705, + "step": 3734 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.4536, + "step": 3735 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.5407, + "step": 3736 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7118, + "step": 3737 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.4877, + "step": 3738 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.1927, + "step": 3739 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.3252, + "step": 3740 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9464, + "step": 3741 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3133, + "step": 3742 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.427, + "step": 3743 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.8029, + "step": 3744 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8428, + "step": 3745 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3284, + "step": 3746 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2628, + "step": 3747 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.466, + "step": 3748 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3269, + "step": 3749 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1805, + "step": 3750 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.5579, + "step": 3751 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.581, + "step": 3752 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0164, + "step": 3753 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.9736, + "step": 3754 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8441, + "step": 3755 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3202, + "step": 3756 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9395, + "step": 3757 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2659, + "step": 3758 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.5779, + "step": 3759 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1052, + "step": 3760 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0075, + "step": 3761 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5628, + "step": 3762 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7005, + "step": 3763 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.979, + "step": 3764 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.9874, + "step": 3765 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.8137, + "step": 3766 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1964, + "step": 3767 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.9047, + "step": 3768 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.2724, + "step": 3769 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8014, + "step": 3770 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7738, + "step": 3771 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.926, + "step": 3772 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.94, + "step": 3773 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1657, + "step": 3774 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9987, + "step": 3775 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.5263, + "step": 3776 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.0671, + "step": 3777 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.1628, + "step": 3778 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4194, + "step": 3779 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5225, + "step": 3780 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.3863, + "step": 3781 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7951, + "step": 3782 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5864, + "step": 3783 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.177, + "step": 3784 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2334, + "step": 3785 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.8444, + "step": 3786 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7108, + "step": 3787 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6521, + "step": 3788 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4604, + "step": 3789 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.12, + "step": 3790 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3275, + "step": 3791 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9743, + "step": 3792 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6064, + "step": 3793 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.6108, + "step": 3794 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0104, + "step": 3795 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.2739, + "step": 3796 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8316, + "step": 3797 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5044, + "step": 3798 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.838, + "step": 3799 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.5967, + "step": 3800 + }, + { + "epoch": 0.03, + "eval_loss": 6.464095592498779, + "eval_runtime": 22.5612, + "eval_samples_per_second": 2.216, + "eval_steps_per_second": 1.108, + "step": 3800 + }, + { + "epoch": 0.03, + "mmlu_eval_accuracy": 0.2525477994227994, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.07142857142857142, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, + "mmlu_loss": 3.589159059524536, + "step": 3800 + }, + { + "epoch": 0.03, + "step": 3800, + "total_flos": 6.198225528943411e+16, + "train_loss": 1.5118552861401908, + "train_runtime": 2842.6735, + "train_samples_per_second": 10.553, + "train_steps_per_second": 10.553 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.59, + "step": 3801 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.0344, + "step": 3802 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.9693, + "step": 3803 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4471, + "step": 3804 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2325, + "step": 3805 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.7491, + "step": 3806 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3688, + "step": 3807 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9074, + "step": 3808 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.092, + "step": 3809 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3568, + "step": 3810 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5895, + "step": 3811 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9071, + "step": 3812 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4571, + "step": 3813 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8045, + "step": 3814 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1976, + "step": 3815 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.356, + "step": 3816 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.3433, + "step": 3817 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5517, + "step": 3818 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0198, + "step": 3819 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4643, + "step": 3820 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9985, + "step": 3821 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4659, + "step": 3822 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7302, + "step": 3823 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.3538, + "step": 3824 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.3654, + "step": 3825 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9622, + "step": 3826 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.5866, + "step": 3827 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7111, + "step": 3828 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4568, + "step": 3829 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.9525, + "step": 3830 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1796, + "step": 3831 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8362, + "step": 3832 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.2532, + "step": 3833 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.2513, + "step": 3834 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.244, + "step": 3835 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1098, + "step": 3836 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.6361, + "step": 3837 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1826, + "step": 3838 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.157, + "step": 3839 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6614, + "step": 3840 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.0458, + "step": 3841 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.022, + "step": 3842 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5359, + "step": 3843 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9234, + "step": 3844 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.6395, + "step": 3845 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.8419, + "step": 3846 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8434, + "step": 3847 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.207, + "step": 3848 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3591, + "step": 3849 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8356, + "step": 3850 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9219, + "step": 3851 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3402, + "step": 3852 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8806, + "step": 3853 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.5234, + "step": 3854 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7777, + "step": 3855 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.1154, + "step": 3856 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.5921, + "step": 3857 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7223, + "step": 3858 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 10.4473, + "step": 3859 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.4797, + "step": 3860 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.1393, + "step": 3861 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.1977, + "step": 3862 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.5573, + "step": 3863 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8899, + "step": 3864 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.7704, + "step": 3865 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6136, + "step": 3866 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.7599, + "step": 3867 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0662, + "step": 3868 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.0538, + "step": 3869 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.934, + "step": 3870 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8072, + "step": 3871 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3782, + "step": 3872 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7663, + "step": 3873 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.486, + "step": 3874 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.7332, + "step": 3875 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.5108, + "step": 3876 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6332, + "step": 3877 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5284, + "step": 3878 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5711, + "step": 3879 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.9677, + "step": 3880 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7378, + "step": 3881 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3763, + "step": 3882 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7173, + "step": 3883 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2495, + "step": 3884 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1252, + "step": 3885 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8202, + "step": 3886 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.5098, + "step": 3887 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6304, + "step": 3888 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6211, + "step": 3889 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.6213, + "step": 3890 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.6606, + "step": 3891 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0749, + "step": 3892 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8867, + "step": 3893 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.6526, + "step": 3894 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7102, + "step": 3895 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2593, + "step": 3896 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2783, + "step": 3897 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1808, + "step": 3898 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.5008, + "step": 3899 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8244, + "step": 3900 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.8664, + "step": 3901 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3084, + "step": 3902 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8537, + "step": 3903 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1877, + "step": 3904 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.6313, + "step": 3905 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2842, + "step": 3906 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.6201, + "step": 3907 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.6035, + "step": 3908 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.5118, + "step": 3909 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8019, + "step": 3910 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4406, + "step": 3911 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.1105, + "step": 3912 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.3885, + "step": 3913 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.5057, + "step": 3914 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8441, + "step": 3915 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.5325, + "step": 3916 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.9075, + "step": 3917 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9497, + "step": 3918 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4994, + "step": 3919 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3098, + "step": 3920 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0306, + "step": 3921 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1548, + "step": 3922 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9829, + "step": 3923 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1177, + "step": 3924 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.1212, + "step": 3925 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9675, + "step": 3926 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.6422, + "step": 3927 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.0774, + "step": 3928 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5982, + "step": 3929 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9857, + "step": 3930 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.1237, + "step": 3931 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.7049, + "step": 3932 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1052, + "step": 3933 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.0896, + "step": 3934 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0104, + "step": 3935 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4442, + "step": 3936 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1093, + "step": 3937 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5394, + "step": 3938 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.0815, + "step": 3939 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1694, + "step": 3940 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6317, + "step": 3941 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6136, + "step": 3942 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7063, + "step": 3943 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9645, + "step": 3944 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2328, + "step": 3945 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8863, + "step": 3946 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7473, + "step": 3947 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9884, + "step": 3948 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2287, + "step": 3949 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3459, + "step": 3950 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.5786, + "step": 3951 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2403, + "step": 3952 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0499, + "step": 3953 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.1844, + "step": 3954 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6237, + "step": 3955 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.72, + "step": 3956 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3407, + "step": 3957 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0939, + "step": 3958 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5248, + "step": 3959 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3676, + "step": 3960 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6449, + "step": 3961 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1033, + "step": 3962 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9184, + "step": 3963 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0001, + "step": 3964 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7925, + "step": 3965 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1872, + "step": 3966 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0878, + "step": 3967 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2376, + "step": 3968 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6891, + "step": 3969 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9365, + "step": 3970 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9592, + "step": 3971 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.051, + "step": 3972 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5806, + "step": 3973 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6242, + "step": 3974 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3711, + "step": 3975 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3724, + "step": 3976 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.681, + "step": 3977 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9895, + "step": 3978 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.1728, + "step": 3979 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6712, + "step": 3980 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2322, + "step": 3981 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8414, + "step": 3982 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2529, + "step": 3983 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3793, + "step": 3984 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.4325, + "step": 3985 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5571, + "step": 3986 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8588, + "step": 3987 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0937, + "step": 3988 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.4595, + "step": 3989 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2496, + "step": 3990 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0736, + "step": 3991 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.1827, + "step": 3992 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2944, + "step": 3993 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.2309, + "step": 3994 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.5547, + "step": 3995 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.5263, + "step": 3996 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1416, + "step": 3997 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.9477, + "step": 3998 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8041, + "step": 3999 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8932, + "step": 4000 + }, + { + "epoch": 0.03, + "eval_loss": 6.467012405395508, + "eval_runtime": 22.2494, + "eval_samples_per_second": 2.247, + "eval_steps_per_second": 1.124, + "step": 4000 + }, + { + "epoch": 0.03, + "mmlu_eval_accuracy": 0.2525477994227994, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.07142857142857142, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, + "mmlu_loss": 3.5710299587249756, + "step": 4000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.07, + "step": 4001 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1685, + "step": 4002 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7579, + "step": 4003 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6209, + "step": 4004 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.1189, + "step": 4005 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4346, + "step": 4006 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0016, + "step": 4007 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1685, + "step": 4008 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3534, + "step": 4009 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.7965, + "step": 4010 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4994, + "step": 4011 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1394, + "step": 4012 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3169, + "step": 4013 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.261, + "step": 4014 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8362, + "step": 4015 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8274, + "step": 4016 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5416, + "step": 4017 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6363, + "step": 4018 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1583, + "step": 4019 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1575, + "step": 4020 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0904, + "step": 4021 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0278, + "step": 4022 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.8278, + "step": 4023 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.535, + "step": 4024 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3822, + "step": 4025 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0538, + "step": 4026 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2228, + "step": 4027 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.509, + "step": 4028 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4223, + "step": 4029 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1033, + "step": 4030 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1594, + "step": 4031 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3259, + "step": 4032 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1647, + "step": 4033 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3303, + "step": 4034 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5812, + "step": 4035 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4741, + "step": 4036 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.8335, + "step": 4037 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8835, + "step": 4038 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3226, + "step": 4039 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.3215, + "step": 4040 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.5474, + "step": 4041 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7139, + "step": 4042 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6725, + "step": 4043 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6821, + "step": 4044 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8876, + "step": 4045 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.4142, + "step": 4046 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4157, + "step": 4047 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.441, + "step": 4048 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8809, + "step": 4049 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8153, + "step": 4050 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1665, + "step": 4051 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5073, + "step": 4052 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4909, + "step": 4053 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9506, + "step": 4054 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.0875, + "step": 4055 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3291, + "step": 4056 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6951, + "step": 4057 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7277, + "step": 4058 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4189, + "step": 4059 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.9827, + "step": 4060 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0741, + "step": 4061 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0725, + "step": 4062 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9464, + "step": 4063 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.4358, + "step": 4064 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.4715, + "step": 4065 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3786, + "step": 4066 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4934, + "step": 4067 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2759, + "step": 4068 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6035, + "step": 4069 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2499, + "step": 4070 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1888, + "step": 4071 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.0456, + "step": 4072 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.951, + "step": 4073 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4353, + "step": 4074 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7801, + "step": 4075 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7917, + "step": 4076 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3531, + "step": 4077 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6331, + "step": 4078 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5221, + "step": 4079 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1318, + "step": 4080 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2375, + "step": 4081 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8938, + "step": 4082 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1559, + "step": 4083 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2144, + "step": 4084 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4366, + "step": 4085 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1172, + "step": 4086 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3419, + "step": 4087 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6779, + "step": 4088 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1309, + "step": 4089 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.019, + "step": 4090 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0277, + "step": 4091 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4826, + "step": 4092 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3211, + "step": 4093 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.6339, + "step": 4094 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.642, + "step": 4095 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7217, + "step": 4096 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.076, + "step": 4097 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5584, + "step": 4098 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.3251, + "step": 4099 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9014, + "step": 4100 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.184, + "step": 4101 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2203, + "step": 4102 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6179, + "step": 4103 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.4954, + "step": 4104 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.1081, + "step": 4105 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3472, + "step": 4106 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.5877, + "step": 4107 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2448, + "step": 4108 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6592, + "step": 4109 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2416, + "step": 4110 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7873, + "step": 4111 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4713, + "step": 4112 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3444, + "step": 4113 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.6219, + "step": 4114 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2364, + "step": 4115 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3227, + "step": 4116 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9786, + "step": 4117 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1939, + "step": 4118 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.781, + "step": 4119 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4606, + "step": 4120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.5705, + "step": 4121 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1414, + "step": 4122 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.79, + "step": 4123 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9486, + "step": 4124 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.341, + "step": 4125 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5912, + "step": 4126 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3169, + "step": 4127 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.329, + "step": 4128 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1756, + "step": 4129 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0224, + "step": 4130 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7741, + "step": 4131 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0836, + "step": 4132 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.4244, + "step": 4133 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.4046, + "step": 4134 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0585, + "step": 4135 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.8267, + "step": 4136 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8221, + "step": 4137 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.439, + "step": 4138 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7489, + "step": 4139 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1346, + "step": 4140 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.3109, + "step": 4141 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6994, + "step": 4142 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.7695, + "step": 4143 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.5227, + "step": 4144 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8782, + "step": 4145 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.1273, + "step": 4146 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.3641, + "step": 4147 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0747, + "step": 4148 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7932, + "step": 4149 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.8848, + "step": 4150 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.9808, + "step": 4151 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9369, + "step": 4152 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.941, + "step": 4153 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 10.5496, + "step": 4154 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1765, + "step": 4155 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2084, + "step": 4156 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7748, + "step": 4157 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7586, + "step": 4158 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.384, + "step": 4159 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8359, + "step": 4160 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2197, + "step": 4161 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6051, + "step": 4162 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4067, + "step": 4163 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.8194, + "step": 4164 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8618, + "step": 4165 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6215, + "step": 4166 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.567, + "step": 4167 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8779, + "step": 4168 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5998, + "step": 4169 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9171, + "step": 4170 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3797, + "step": 4171 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9968, + "step": 4172 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4823, + "step": 4173 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5109, + "step": 4174 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6709, + "step": 4175 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1483, + "step": 4176 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.869, + "step": 4177 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4267, + "step": 4178 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3919, + "step": 4179 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7326, + "step": 4180 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1336, + "step": 4181 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.6834, + "step": 4182 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0282, + "step": 4183 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.2146, + "step": 4184 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7769, + "step": 4185 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1111, + "step": 4186 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3518, + "step": 4187 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.759, + "step": 4188 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4789, + "step": 4189 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2913, + "step": 4190 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.426, + "step": 4191 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.4973, + "step": 4192 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7238, + "step": 4193 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3992, + "step": 4194 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9516, + "step": 4195 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7022, + "step": 4196 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8797, + "step": 4197 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1877, + "step": 4198 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6089, + "step": 4199 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.1715, + "step": 4200 + }, + { + "epoch": 0.03, + "eval_loss": 6.335043907165527, + "eval_runtime": 22.2336, + "eval_samples_per_second": 2.249, + "eval_steps_per_second": 1.124, + "step": 4200 + }, + { + "epoch": 0.03, + "mmlu_eval_accuracy": 0.16574224386724384, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.07142857142857142, + "mmlu_eval_accuracy_astronomy": 0.1875, + "mmlu_eval_accuracy_business_ethics": 0.2222222222222222, + "mmlu_loss": 3.6120329189300535, + "step": 4200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0172, + "step": 4201 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3208, + "step": 4202 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.9768, + "step": 4203 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.224, + "step": 4204 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.112, + "step": 4205 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.701, + "step": 4206 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8528, + "step": 4207 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7786, + "step": 4208 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3293, + "step": 4209 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.4986, + "step": 4210 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8651, + "step": 4211 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.4758, + "step": 4212 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.97, + "step": 4213 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1929, + "step": 4214 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9504, + "step": 4215 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.0213, + "step": 4216 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.263, + "step": 4217 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3761, + "step": 4218 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1654, + "step": 4219 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8854, + "step": 4220 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8664, + "step": 4221 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4596, + "step": 4222 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0252, + "step": 4223 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6682, + "step": 4224 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1858, + "step": 4225 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.6278, + "step": 4226 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.0912, + "step": 4227 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2919, + "step": 4228 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7584, + "step": 4229 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9987, + "step": 4230 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4801, + "step": 4231 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.4327, + "step": 4232 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.5746, + "step": 4233 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.3162, + "step": 4234 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8182, + "step": 4235 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6069, + "step": 4236 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9851, + "step": 4237 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.267, + "step": 4238 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9602, + "step": 4239 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5938, + "step": 4240 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8191, + "step": 4241 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5139, + "step": 4242 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6779, + "step": 4243 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5514, + "step": 4244 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1473, + "step": 4245 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.9735, + "step": 4246 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1424, + "step": 4247 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.5365, + "step": 4248 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3266, + "step": 4249 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5769, + "step": 4250 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0873, + "step": 4251 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5799, + "step": 4252 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3806, + "step": 4253 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4925, + "step": 4254 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.277, + "step": 4255 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1996, + "step": 4256 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8333, + "step": 4257 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8422, + "step": 4258 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.9374, + "step": 4259 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2097, + "step": 4260 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2198, + "step": 4261 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4534, + "step": 4262 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4716, + "step": 4263 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0549, + "step": 4264 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4943, + "step": 4265 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.604, + "step": 4266 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6695, + "step": 4267 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1015, + "step": 4268 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6276, + "step": 4269 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.498, + "step": 4270 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7066, + "step": 4271 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7354, + "step": 4272 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6121, + "step": 4273 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2452, + "step": 4274 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2953, + "step": 4275 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.0474, + "step": 4276 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7764, + "step": 4277 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3297, + "step": 4278 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.3487, + "step": 4279 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.0156, + "step": 4280 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6313, + "step": 4281 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7101, + "step": 4282 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.2139, + "step": 4283 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3695, + "step": 4284 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6284, + "step": 4285 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7277, + "step": 4286 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.1922, + "step": 4287 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1183, + "step": 4288 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.197, + "step": 4289 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.904, + "step": 4290 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4535, + "step": 4291 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1273, + "step": 4292 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.1631, + "step": 4293 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.5194, + "step": 4294 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.82, + "step": 4295 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2122, + "step": 4296 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4187, + "step": 4297 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0907, + "step": 4298 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.1236, + "step": 4299 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0835, + "step": 4300 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9702, + "step": 4301 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1675, + "step": 4302 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8704, + "step": 4303 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2006, + "step": 4304 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5152, + "step": 4305 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.511, + "step": 4306 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3767, + "step": 4307 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3041, + "step": 4308 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9436, + "step": 4309 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1931, + "step": 4310 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.6127, + "step": 4311 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3527, + "step": 4312 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3532, + "step": 4313 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3334, + "step": 4314 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8331, + "step": 4315 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8437, + "step": 4316 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6761, + "step": 4317 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7848, + "step": 4318 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3358, + "step": 4319 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5201, + "step": 4320 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.5625, + "step": 4321 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5408, + "step": 4322 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.6977, + "step": 4323 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5366, + "step": 4324 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.9571, + "step": 4325 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2711, + "step": 4326 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3046, + "step": 4327 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.2413, + "step": 4328 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9967, + "step": 4329 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.1353, + "step": 4330 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.8216, + "step": 4331 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.5285, + "step": 4332 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3631, + "step": 4333 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 1.8248, + "step": 4334 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8453, + "step": 4335 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.6769, + "step": 4336 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8327, + "step": 4337 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0459, + "step": 4338 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.8175, + "step": 4339 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0969, + "step": 4340 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6161, + "step": 4341 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.2014, + "step": 4342 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7473, + "step": 4343 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6217, + "step": 4344 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9042, + "step": 4345 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.1346, + "step": 4346 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.117, + "step": 4347 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3109, + "step": 4348 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.2498, + "step": 4349 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6814, + "step": 4350 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.21, + "step": 4351 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8487, + "step": 4352 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.4287, + "step": 4353 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6428, + "step": 4354 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2915, + "step": 4355 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2039, + "step": 4356 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.711, + "step": 4357 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.483, + "step": 4358 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.4743, + "step": 4359 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9889, + "step": 4360 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9437, + "step": 4361 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2056, + "step": 4362 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.3294, + "step": 4363 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3339, + "step": 4364 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2717, + "step": 4365 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0219, + "step": 4366 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7294, + "step": 4367 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.8137, + "step": 4368 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.427, + "step": 4369 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5406, + "step": 4370 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7468, + "step": 4371 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6483, + "step": 4372 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0566, + "step": 4373 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3174, + "step": 4374 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.6436, + "step": 4375 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.4422, + "step": 4376 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2401, + "step": 4377 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6105, + "step": 4378 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1438, + "step": 4379 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3631, + "step": 4380 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2301, + "step": 4381 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4289, + "step": 4382 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1199, + "step": 4383 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5042, + "step": 4384 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6666, + "step": 4385 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0249, + "step": 4386 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7435, + "step": 4387 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0024, + "step": 4388 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7167, + "step": 4389 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1492, + "step": 4390 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2656, + "step": 4391 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8501, + "step": 4392 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8394, + "step": 4393 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.123, + "step": 4394 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7029, + "step": 4395 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9924, + "step": 4396 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8444, + "step": 4397 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.257, + "step": 4398 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6532, + "step": 4399 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.4572, + "step": 4400 + }, + { + "epoch": 0.03, + "eval_loss": 6.4130682945251465, + "eval_runtime": 22.2069, + "eval_samples_per_second": 2.252, + "eval_steps_per_second": 1.126, + "step": 4400 + }, + { + "epoch": 0.03, + "mmlu_eval_accuracy": 0.2525477994227994, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.07142857142857142, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, + "mmlu_loss": 3.9351483535766603, + "step": 4400 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.8926, + "step": 4401 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2074, + "step": 4402 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.2537, + "step": 4403 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3481, + "step": 4404 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0276, + "step": 4405 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0158, + "step": 4406 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.3871, + "step": 4407 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.3431, + "step": 4408 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3419, + "step": 4409 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3535, + "step": 4410 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7519, + "step": 4411 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.51, + "step": 4412 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7046, + "step": 4413 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2059, + "step": 4414 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8505, + "step": 4415 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6643, + "step": 4416 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7986, + "step": 4417 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.8461, + "step": 4418 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0336, + "step": 4419 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2912, + "step": 4420 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.9896, + "step": 4421 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9749, + "step": 4422 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.81, + "step": 4423 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4518, + "step": 4424 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8091, + "step": 4425 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.9194, + "step": 4426 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5219, + "step": 4427 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6828, + "step": 4428 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1137, + "step": 4429 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6461, + "step": 4430 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0152, + "step": 4431 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8011, + "step": 4432 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4696, + "step": 4433 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8703, + "step": 4434 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5802, + "step": 4435 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.6443, + "step": 4436 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4945, + "step": 4437 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.8102, + "step": 4438 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1765, + "step": 4439 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9834, + "step": 4440 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.6708, + "step": 4441 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.9588, + "step": 4442 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7362, + "step": 4443 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5623, + "step": 4444 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5016, + "step": 4445 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0908, + "step": 4446 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6242, + "step": 4447 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1595, + "step": 4448 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6195, + "step": 4449 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.8756, + "step": 4450 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7154, + "step": 4451 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9695, + "step": 4452 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7481, + "step": 4453 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.9299, + "step": 4454 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.6756, + "step": 4455 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.517, + "step": 4456 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.8601, + "step": 4457 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0095, + "step": 4458 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9689, + "step": 4459 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8858, + "step": 4460 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5432, + "step": 4461 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0526, + "step": 4462 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7423, + "step": 4463 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2697, + "step": 4464 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6094, + "step": 4465 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4183, + "step": 4466 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8094, + "step": 4467 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8541, + "step": 4468 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7971, + "step": 4469 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7192, + "step": 4470 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3375, + "step": 4471 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.9396, + "step": 4472 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.445, + "step": 4473 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6076, + "step": 4474 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6831, + "step": 4475 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.6578, + "step": 4476 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.405, + "step": 4477 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8647, + "step": 4478 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.7002, + "step": 4479 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6372, + "step": 4480 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9604, + "step": 4481 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.0071, + "step": 4482 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.6172, + "step": 4483 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.04, + "step": 4484 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.448, + "step": 4485 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3027, + "step": 4486 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.9143, + "step": 4487 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8775, + "step": 4488 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7101, + "step": 4489 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.6692, + "step": 4490 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8102, + "step": 4491 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6985, + "step": 4492 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.702, + "step": 4493 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7977, + "step": 4494 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.515, + "step": 4495 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.5507, + "step": 4496 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5842, + "step": 4497 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6957, + "step": 4498 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1066, + "step": 4499 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5859, + "step": 4500 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.451, + "step": 4501 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7379, + "step": 4502 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.6385, + "step": 4503 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2374, + "step": 4504 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2351, + "step": 4505 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3621, + "step": 4506 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3366, + "step": 4507 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7719, + "step": 4508 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1457, + "step": 4509 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3899, + "step": 4510 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7728, + "step": 4511 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9292, + "step": 4512 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6082, + "step": 4513 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2944, + "step": 4514 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6249, + "step": 4515 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.5819, + "step": 4516 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9274, + "step": 4517 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0141, + "step": 4518 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7056, + "step": 4519 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9713, + "step": 4520 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5392, + "step": 4521 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5142, + "step": 4522 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.251, + "step": 4523 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.143, + "step": 4524 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9271, + "step": 4525 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.038, + "step": 4526 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.2392, + "step": 4527 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.4011, + "step": 4528 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5066, + "step": 4529 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.0143, + "step": 4530 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.8365, + "step": 4531 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7837, + "step": 4532 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6519, + "step": 4533 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.828, + "step": 4534 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9791, + "step": 4535 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0304, + "step": 4536 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6018, + "step": 4537 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3392, + "step": 4538 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3833, + "step": 4539 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.3103, + "step": 4540 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5934, + "step": 4541 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2171, + "step": 4542 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0982, + "step": 4543 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.2001, + "step": 4544 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.941, + "step": 4545 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.6556, + "step": 4546 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9446, + "step": 4547 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.006, + "step": 4548 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.4772, + "step": 4549 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8168, + "step": 4550 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9478, + "step": 4551 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5726, + "step": 4552 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3606, + "step": 4553 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.9709, + "step": 4554 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.863, + "step": 4555 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8277, + "step": 4556 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4027, + "step": 4557 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0372, + "step": 4558 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7037, + "step": 4559 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.864, + "step": 4560 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.826, + "step": 4561 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5887, + "step": 4562 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0238, + "step": 4563 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7874, + "step": 4564 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 10.1767, + "step": 4565 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2468, + "step": 4566 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6452, + "step": 4567 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.077, + "step": 4568 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2059, + "step": 4569 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.377, + "step": 4570 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.037, + "step": 4571 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.8317, + "step": 4572 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.3115, + "step": 4573 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0373, + "step": 4574 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.082, + "step": 4575 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7769, + "step": 4576 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8292, + "step": 4577 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3808, + "step": 4578 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.84, + "step": 4579 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9102, + "step": 4580 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2409, + "step": 4581 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.5787, + "step": 4582 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.043, + "step": 4583 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.5522, + "step": 4584 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.3926, + "step": 4585 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.4021, + "step": 4586 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.2431, + "step": 4587 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.1308, + "step": 4588 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.819, + "step": 4589 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.1779, + "step": 4590 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.8237, + "step": 4591 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.0223, + "step": 4592 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.866, + "step": 4593 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.0287, + "step": 4594 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.2001, + "step": 4595 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.0228, + "step": 4596 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.4247, + "step": 4597 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.6213, + "step": 4598 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.1848, + "step": 4599 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.77, + "step": 4600 + }, + { + "epoch": 0.04, + "eval_loss": 6.408684253692627, + "eval_runtime": 22.226, + "eval_samples_per_second": 2.25, + "eval_steps_per_second": 1.125, + "step": 4600 + }, + { + "epoch": 0.04, + "mmlu_eval_accuracy": 0.2525477994227994, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.07142857142857142, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, + "mmlu_loss": 3.758997039794922, + "step": 4600 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.0023, + "step": 4601 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.9024, + "step": 4602 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.5583, + "step": 4603 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.0193, + "step": 4604 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.3402, + "step": 4605 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.3268, + "step": 4606 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.7445, + "step": 4607 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.1852, + "step": 4608 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.1238, + "step": 4609 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.3282, + "step": 4610 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.254, + "step": 4611 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.1053, + "step": 4612 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.8592, + "step": 4613 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.6462, + "step": 4614 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.669, + "step": 4615 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.1189, + "step": 4616 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.6205, + "step": 4617 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.2284, + "step": 4618 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.4663, + "step": 4619 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.4695, + "step": 4620 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.2312, + "step": 4621 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.8074, + "step": 4622 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.1565, + "step": 4623 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.7891, + "step": 4624 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.1226, + "step": 4625 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.8755, + "step": 4626 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.0405, + "step": 4627 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.0656, + "step": 4628 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.9288, + "step": 4629 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.9022, + "step": 4630 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.2631, + "step": 4631 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.8818, + "step": 4632 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.8685, + "step": 4633 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.1125, + "step": 4634 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.4889, + "step": 4635 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.076, + "step": 4636 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.9786, + "step": 4637 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.8958, + "step": 4638 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.4134, + "step": 4639 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.088, + "step": 4640 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.8596, + "step": 4641 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.3521, + "step": 4642 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.5089, + "step": 4643 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.8831, + "step": 4644 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.3512, + "step": 4645 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.0645, + "step": 4646 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.2109, + "step": 4647 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.69, + "step": 4648 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.9733, + "step": 4649 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.3857, + "step": 4650 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.9065, + "step": 4651 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.8798, + "step": 4652 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.9107, + "step": 4653 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.2948, + "step": 4654 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.3601, + "step": 4655 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.8386, + "step": 4656 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.4677, + "step": 4657 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.9595, + "step": 4658 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.901, + "step": 4659 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.7767, + "step": 4660 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.7727, + "step": 4661 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.1065, + "step": 4662 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.663, + "step": 4663 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.6291, + "step": 4664 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.2727, + "step": 4665 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.8332, + "step": 4666 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.5858, + "step": 4667 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.6918, + "step": 4668 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.4193, + "step": 4669 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.236, + "step": 4670 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.1981, + "step": 4671 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.8736, + "step": 4672 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.9571, + "step": 4673 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.5301, + "step": 4674 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.5035, + "step": 4675 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.3201, + "step": 4676 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.2301, + "step": 4677 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.5596, + "step": 4678 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.1596, + "step": 4679 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.6705, + "step": 4680 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.3611, + "step": 4681 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.2311, + "step": 4682 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.1461, + "step": 4683 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.8506, + "step": 4684 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.1727, + "step": 4685 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.8938, + "step": 4686 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.1876, + "step": 4687 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.4972, + "step": 4688 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.5906, + "step": 4689 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.6744, + "step": 4690 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.4496, + "step": 4691 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.9415, + "step": 4692 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.3577, + "step": 4693 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.672, + "step": 4694 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.5867, + "step": 4695 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.4507, + "step": 4696 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.015, + "step": 4697 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.532, + "step": 4698 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.5171, + "step": 4699 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.0666, + "step": 4700 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 9.8359, + "step": 4701 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.6945, + "step": 4702 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 9.4712, + "step": 4703 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.2963, + "step": 4704 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.4449, + "step": 4705 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.08, + "step": 4706 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.8184, + "step": 4707 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.8886, + "step": 4708 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.4294, + "step": 4709 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.6778, + "step": 4710 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.9429, + "step": 4711 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.185, + "step": 4712 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.8384, + "step": 4713 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.7196, + "step": 4714 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.8159, + "step": 4715 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.9228, + "step": 4716 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.922, + "step": 4717 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.2769, + "step": 4718 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.4102, + "step": 4719 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.3473, + "step": 4720 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.7315, + "step": 4721 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.5719, + "step": 4722 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.45, + "step": 4723 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.7166, + "step": 4724 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.003, + "step": 4725 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.6474, + "step": 4726 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.0281, + "step": 4727 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.1768, + "step": 4728 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.3749, + "step": 4729 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.5307, + "step": 4730 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.9728, + "step": 4731 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.7612, + "step": 4732 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 9.0616, + "step": 4733 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.7188, + "step": 4734 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.9895, + "step": 4735 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 9.2942, + "step": 4736 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.6735, + "step": 4737 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.1246, + "step": 4738 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.5913, + "step": 4739 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.7382, + "step": 4740 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.629, + "step": 4741 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.1264, + "step": 4742 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.5582, + "step": 4743 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.9757, + "step": 4744 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.4638, + "step": 4745 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.9211, + "step": 4746 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.5218, + "step": 4747 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.3471, + "step": 4748 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.4113, + "step": 4749 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.4051, + "step": 4750 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.8378, + "step": 4751 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.278, + "step": 4752 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.2717, + "step": 4753 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.7359, + "step": 4754 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.4031, + "step": 4755 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.6597, + "step": 4756 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.394, + "step": 4757 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.3462, + "step": 4758 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.4558, + "step": 4759 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.1341, + "step": 4760 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.6349, + "step": 4761 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.3346, + "step": 4762 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.188, + "step": 4763 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.0209, + "step": 4764 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.4156, + "step": 4765 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.3871, + "step": 4766 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.7964, + "step": 4767 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.2041, + "step": 4768 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.3015, + "step": 4769 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.1091, + "step": 4770 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.2722, + "step": 4771 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.0096, + "step": 4772 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.2771, + "step": 4773 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.5131, + "step": 4774 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.8754, + "step": 4775 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.0951, + "step": 4776 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.5389, + "step": 4777 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.6913, + "step": 4778 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.3431, + "step": 4779 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.3538, + "step": 4780 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.4533, + "step": 4781 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.6601, + "step": 4782 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.9653, + "step": 4783 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.0344, + "step": 4784 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.0719, + "step": 4785 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.3465, + "step": 4786 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.5308, + "step": 4787 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.9054, + "step": 4788 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.2575, + "step": 4789 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.905, + "step": 4790 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.7652, + "step": 4791 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.3971, + "step": 4792 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.9916, + "step": 4793 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.3688, + "step": 4794 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.9891, + "step": 4795 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.7919, + "step": 4796 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.2468, + "step": 4797 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.1588, + "step": 4798 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.0453, + "step": 4799 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.7421, + "step": 4800 + }, + { + "epoch": 0.04, + "eval_loss": 6.6897292137146, + "eval_runtime": 22.2665, + "eval_samples_per_second": 2.246, + "eval_steps_per_second": 1.123, + "step": 4800 + }, + { + "epoch": 0.04, + "mmlu_eval_accuracy": 0.2525477994227994, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.07142857142857142, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, + "mmlu_loss": 4.275100479125976, + "step": 4800 + }, + { + "epoch": 0.04, + "step": 4800, + "total_flos": 7.930864121570918e+16, + "train_loss": 1.2399261393149694, + "train_runtime": 2886.6567, + "train_samples_per_second": 10.393, + "train_steps_per_second": 10.393 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.196, + "step": 4801 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.4766, + "step": 4802 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.5177, + "step": 4803 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.6057, + "step": 4804 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.9972, + "step": 4805 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.3521, + "step": 4806 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.9037, + "step": 4807 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.967, + "step": 4808 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.2405, + "step": 4809 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.5253, + "step": 4810 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.2458, + "step": 4811 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.179, + "step": 4812 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.6969, + "step": 4813 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 9.2289, + "step": 4814 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.0946, + "step": 4815 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.9045, + "step": 4816 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 9.6952, + "step": 4817 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.7265, + "step": 4818 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.9574, + "step": 4819 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.3774, + "step": 4820 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.7837, + "step": 4821 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.3796, + "step": 4822 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.4443, + "step": 4823 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.7734, + "step": 4824 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 9.5535, + "step": 4825 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.1014, + "step": 4826 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.5574, + "step": 4827 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.5114, + "step": 4828 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.431, + "step": 4829 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.8042, + "step": 4830 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.4997, + "step": 4831 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.9027, + "step": 4832 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.7126, + "step": 4833 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.8638, + "step": 4834 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.4997, + "step": 4835 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.8501, + "step": 4836 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.6346, + "step": 4837 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.8403, + "step": 4838 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.8362, + "step": 4839 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.8393, + "step": 4840 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.6428, + "step": 4841 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.9946, + "step": 4842 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.3163, + "step": 4843 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.9659, + "step": 4844 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.5787, + "step": 4845 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 9.0435, + "step": 4846 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.6627, + "step": 4847 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.0435, + "step": 4848 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.584, + "step": 4849 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.5761, + "step": 4850 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.0644, + "step": 4851 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.7897, + "step": 4852 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.7933, + "step": 4853 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.0918, + "step": 4854 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.1191, + "step": 4855 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.6498, + "step": 4856 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.1834, + "step": 4857 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.5713, + "step": 4858 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.643, + "step": 4859 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.0051, + "step": 4860 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.3192, + "step": 4861 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.6787, + "step": 4862 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.1336, + "step": 4863 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.1196, + "step": 4864 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.7662, + "step": 4865 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.5099, + "step": 4866 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.4698, + "step": 4867 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.9245, + "step": 4868 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.0627, + "step": 4869 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.4951, + "step": 4870 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 9.7325, + "step": 4871 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.3552, + "step": 4872 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.0359, + "step": 4873 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.259, + "step": 4874 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.39, + "step": 4875 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.2283, + "step": 4876 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.2338, + "step": 4877 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.5157, + "step": 4878 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.7822, + "step": 4879 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.4882, + "step": 4880 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.9394, + "step": 4881 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 9.7844, + "step": 4882 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.0808, + "step": 4883 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.4349, + "step": 4884 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.7747, + "step": 4885 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.2778, + "step": 4886 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.9457, + "step": 4887 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.196, + "step": 4888 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.2576, + "step": 4889 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.3978, + "step": 4890 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.4963, + "step": 4891 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.8915, + "step": 4892 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.8344, + "step": 4893 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.5248, + "step": 4894 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.9729, + "step": 4895 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.7504, + "step": 4896 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.4288, + "step": 4897 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.173, + "step": 4898 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.6288, + "step": 4899 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.3934, + "step": 4900 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.8056, + "step": 4901 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.7523, + "step": 4902 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.6066, + "step": 4903 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.6161, + "step": 4904 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.8099, + "step": 4905 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.2048, + "step": 4906 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.6112, + "step": 4907 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.5394, + "step": 4908 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.9661, + "step": 4909 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.1945, + "step": 4910 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.1823, + "step": 4911 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.5774, + "step": 4912 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.3444, + "step": 4913 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.8732, + "step": 4914 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.5685, + "step": 4915 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.6944, + "step": 4916 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.9668, + "step": 4917 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.9854, + "step": 4918 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 9.0986, + "step": 4919 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.8546, + "step": 4920 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.658, + "step": 4921 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.8595, + "step": 4922 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.3526, + "step": 4923 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.6612, + "step": 4924 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.4798, + "step": 4925 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.0779, + "step": 4926 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.8211, + "step": 4927 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.9007, + "step": 4928 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.5789, + "step": 4929 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.0357, + "step": 4930 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.8846, + "step": 4931 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.7409, + "step": 4932 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.4081, + "step": 4933 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.3187, + "step": 4934 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.1926, + "step": 4935 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.2912, + "step": 4936 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.6701, + "step": 4937 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.8162, + "step": 4938 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.7585, + "step": 4939 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.6232, + "step": 4940 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.9613, + "step": 4941 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.954, + "step": 4942 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.7287, + "step": 4943 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.6305, + "step": 4944 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.6932, + "step": 4945 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.6798, + "step": 4946 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.6665, + "step": 4947 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.1462, + "step": 4948 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.0676, + "step": 4949 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.2834, + "step": 4950 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.8273, + "step": 4951 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.635, + "step": 4952 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.9245, + "step": 4953 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.5401, + "step": 4954 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.2944, + "step": 4955 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.6151, + "step": 4956 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.3668, + "step": 4957 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.4506, + "step": 4958 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.8919, + "step": 4959 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.7462, + "step": 4960 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.8915, + "step": 4961 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.4696, + "step": 4962 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.0112, + "step": 4963 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.1888, + "step": 4964 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.1465, + "step": 4965 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.6028, + "step": 4966 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.279, + "step": 4967 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.4619, + "step": 4968 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.7617, + "step": 4969 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.0521, + "step": 4970 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.9583, + "step": 4971 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.5725, + "step": 4972 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.3248, + "step": 4973 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.9984, + "step": 4974 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.7955, + "step": 4975 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.4351, + "step": 4976 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.5412, + "step": 4977 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 9.4986, + "step": 4978 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.4686, + "step": 4979 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.3709, + "step": 4980 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.4326, + "step": 4981 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.3118, + "step": 4982 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.2933, + "step": 4983 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.2728, + "step": 4984 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.5518, + "step": 4985 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.8085, + "step": 4986 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.024, + "step": 4987 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.7633, + "step": 4988 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.9099, + "step": 4989 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.0304, + "step": 4990 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.939, + "step": 4991 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.1024, + "step": 4992 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.2432, + "step": 4993 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.9213, + "step": 4994 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.6644, + "step": 4995 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.9821, + "step": 4996 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.9677, + "step": 4997 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.7992, + "step": 4998 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.2743, + "step": 4999 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.5054, + "step": 5000 + }, + { + "epoch": 0.04, + "eval_loss": 6.463876247406006, + "eval_runtime": 22.4171, + "eval_samples_per_second": 2.23, + "eval_steps_per_second": 1.115, + "step": 5000 + }, + { + "epoch": 0.04, + "mmlu_eval_accuracy": 0.2525477994227994, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.07142857142857142, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, + "mmlu_loss": 3.4964506435394287, + "step": 5000 } ], "max_steps": 30000, "num_train_epochs": 1, - "total_flos": 5.872515269482906e+16, + "total_flos": 8.241807132308275e+16, "trial_name": null, "trial_params": null } diff --git a/checkpoint-5000/training_args.bin b/checkpoint-5000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cdd9dec0839dd55328a8e59cbb2bcf7c0315c309 --- /dev/null +++ b/checkpoint-5000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd2e97ea6d2b8e9c0ed7efbb032ce79458292aa99ced1bbeb7b777b9663a324f +size 6011 diff --git a/eval_results.json b/eval_results.json index a78d025bcd9bf3439ee44df8795d35c6a1e0acf9..fa213d7d8ccd3bc2b98fa96c705483c866c1adb0 100644 --- a/eval_results.json +++ b/eval_results.json @@ -1,7 +1,7 @@ { - "epoch": 0.03, - "eval_loss": 6.423073768615723, - "eval_runtime": 22.3351, - "eval_samples_per_second": 2.239, - "eval_steps_per_second": 1.119 + "epoch": 0.04, + "eval_loss": 6.335043907165527, + "eval_runtime": 21.5795, + "eval_samples_per_second": 2.317, + "eval_steps_per_second": 1.159 } \ No newline at end of file diff --git a/metrics.json b/metrics.json index 504cb99cab028a26d54535227ac0b5c48bb1927b..cb38c28ccccd0212195b7bfbdcea8d5af721c1af 100644 --- a/metrics.json +++ b/metrics.json @@ -1 +1 @@ -{"run_name": "codellama34b_unnatural", "train_runtime": 2842.6735, "train_samples_per_second": 10.553, "train_steps_per_second": 10.553, "train_loss": 1.5118552861401908, "epoch": 0.03, "eval_loss": 6.423073768615723, "eval_runtime": 22.3351, "eval_samples_per_second": 2.239, "eval_steps_per_second": 1.119} \ No newline at end of file +{"run_name": "codellama34b_unnatural", "train_runtime": 2886.6567, "train_samples_per_second": 10.393, "train_steps_per_second": 10.393, "train_loss": 1.2399261393149694, "epoch": 0.04, "eval_loss": 6.335043907165527, "eval_runtime": 21.5795, "eval_samples_per_second": 2.317, "eval_steps_per_second": 1.159} \ No newline at end of file diff --git a/train_results.json b/train_results.json index 60bb344ba4bea7d013a207d5ad4c16f18a079bd3..26ff36e522df08be92b7a502e9504b1d01a7535a 100644 --- a/train_results.json +++ b/train_results.json @@ -1,7 +1,7 @@ { - "epoch": 0.03, - "train_loss": 1.5118552861401908, - "train_runtime": 2842.6735, - "train_samples_per_second": 10.553, - "train_steps_per_second": 10.553 + "epoch": 0.04, + "train_loss": 1.2399261393149694, + "train_runtime": 2886.6567, + "train_samples_per_second": 10.393, + "train_steps_per_second": 10.393 } \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json index 0b1a657780a084daff5e7cbcff8ffb0aee3caeed..d75042bd7b8128e7dc370ba6c733f216769f6dd1 100644 --- a/trainer_state.json +++ b/trainer_state.json @@ -1,8 +1,8 @@ { - "best_metric": 6.423073768615723, - "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-3200", - "epoch": 0.02902757619738752, - "global_step": 3800, + "best_metric": 6.335043907165527, + "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-4200", + "epoch": 0.03666641203880529, + "global_step": 4800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -23166,11 +23166,6110 @@ "train_runtime": 2842.6735, "train_samples_per_second": 10.553, "train_steps_per_second": 10.553 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.59, + "step": 3801 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.0344, + "step": 3802 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.9693, + "step": 3803 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4471, + "step": 3804 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2325, + "step": 3805 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.7491, + "step": 3806 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3688, + "step": 3807 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9074, + "step": 3808 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.092, + "step": 3809 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3568, + "step": 3810 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5895, + "step": 3811 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9071, + "step": 3812 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4571, + "step": 3813 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8045, + "step": 3814 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1976, + "step": 3815 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.356, + "step": 3816 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.3433, + "step": 3817 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5517, + "step": 3818 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0198, + "step": 3819 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4643, + "step": 3820 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9985, + "step": 3821 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4659, + "step": 3822 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7302, + "step": 3823 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.3538, + "step": 3824 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.3654, + "step": 3825 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9622, + "step": 3826 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.5866, + "step": 3827 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7111, + "step": 3828 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4568, + "step": 3829 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.9525, + "step": 3830 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1796, + "step": 3831 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8362, + "step": 3832 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.2532, + "step": 3833 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.2513, + "step": 3834 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.244, + "step": 3835 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1098, + "step": 3836 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.6361, + "step": 3837 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1826, + "step": 3838 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.157, + "step": 3839 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6614, + "step": 3840 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.0458, + "step": 3841 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.022, + "step": 3842 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5359, + "step": 3843 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9234, + "step": 3844 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.6395, + "step": 3845 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.8419, + "step": 3846 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8434, + "step": 3847 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.207, + "step": 3848 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3591, + "step": 3849 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8356, + "step": 3850 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9219, + "step": 3851 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3402, + "step": 3852 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8806, + "step": 3853 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.5234, + "step": 3854 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7777, + "step": 3855 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.1154, + "step": 3856 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.5921, + "step": 3857 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7223, + "step": 3858 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 10.4473, + "step": 3859 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.4797, + "step": 3860 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.1393, + "step": 3861 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.1977, + "step": 3862 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.5573, + "step": 3863 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8899, + "step": 3864 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.7704, + "step": 3865 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6136, + "step": 3866 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.7599, + "step": 3867 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0662, + "step": 3868 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.0538, + "step": 3869 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.934, + "step": 3870 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8072, + "step": 3871 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3782, + "step": 3872 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7663, + "step": 3873 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.486, + "step": 3874 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.7332, + "step": 3875 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.5108, + "step": 3876 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6332, + "step": 3877 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5284, + "step": 3878 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5711, + "step": 3879 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.9677, + "step": 3880 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7378, + "step": 3881 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3763, + "step": 3882 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7173, + "step": 3883 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2495, + "step": 3884 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1252, + "step": 3885 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8202, + "step": 3886 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.5098, + "step": 3887 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6304, + "step": 3888 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6211, + "step": 3889 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.6213, + "step": 3890 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.6606, + "step": 3891 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0749, + "step": 3892 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8867, + "step": 3893 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.6526, + "step": 3894 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7102, + "step": 3895 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2593, + "step": 3896 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2783, + "step": 3897 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1808, + "step": 3898 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.5008, + "step": 3899 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8244, + "step": 3900 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.8664, + "step": 3901 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3084, + "step": 3902 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8537, + "step": 3903 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1877, + "step": 3904 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.6313, + "step": 3905 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2842, + "step": 3906 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.6201, + "step": 3907 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.6035, + "step": 3908 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.5118, + "step": 3909 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8019, + "step": 3910 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4406, + "step": 3911 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.1105, + "step": 3912 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.3885, + "step": 3913 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.5057, + "step": 3914 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8441, + "step": 3915 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.5325, + "step": 3916 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.9075, + "step": 3917 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9497, + "step": 3918 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4994, + "step": 3919 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3098, + "step": 3920 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0306, + "step": 3921 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1548, + "step": 3922 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9829, + "step": 3923 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1177, + "step": 3924 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.1212, + "step": 3925 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9675, + "step": 3926 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.6422, + "step": 3927 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.0774, + "step": 3928 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5982, + "step": 3929 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9857, + "step": 3930 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.1237, + "step": 3931 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.7049, + "step": 3932 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1052, + "step": 3933 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.0896, + "step": 3934 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0104, + "step": 3935 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4442, + "step": 3936 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1093, + "step": 3937 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5394, + "step": 3938 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.0815, + "step": 3939 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1694, + "step": 3940 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6317, + "step": 3941 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6136, + "step": 3942 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7063, + "step": 3943 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9645, + "step": 3944 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2328, + "step": 3945 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8863, + "step": 3946 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7473, + "step": 3947 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9884, + "step": 3948 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2287, + "step": 3949 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3459, + "step": 3950 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.5786, + "step": 3951 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2403, + "step": 3952 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0499, + "step": 3953 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.1844, + "step": 3954 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6237, + "step": 3955 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.72, + "step": 3956 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3407, + "step": 3957 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0939, + "step": 3958 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5248, + "step": 3959 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3676, + "step": 3960 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6449, + "step": 3961 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1033, + "step": 3962 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9184, + "step": 3963 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0001, + "step": 3964 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7925, + "step": 3965 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1872, + "step": 3966 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0878, + "step": 3967 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2376, + "step": 3968 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6891, + "step": 3969 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9365, + "step": 3970 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9592, + "step": 3971 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.051, + "step": 3972 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5806, + "step": 3973 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6242, + "step": 3974 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3711, + "step": 3975 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3724, + "step": 3976 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.681, + "step": 3977 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9895, + "step": 3978 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.1728, + "step": 3979 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6712, + "step": 3980 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2322, + "step": 3981 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8414, + "step": 3982 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2529, + "step": 3983 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3793, + "step": 3984 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.4325, + "step": 3985 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5571, + "step": 3986 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8588, + "step": 3987 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0937, + "step": 3988 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.4595, + "step": 3989 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2496, + "step": 3990 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0736, + "step": 3991 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.1827, + "step": 3992 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2944, + "step": 3993 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.2309, + "step": 3994 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.5547, + "step": 3995 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.5263, + "step": 3996 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1416, + "step": 3997 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.9477, + "step": 3998 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8041, + "step": 3999 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8932, + "step": 4000 + }, + { + "epoch": 0.03, + "eval_loss": 6.467012405395508, + "eval_runtime": 22.2494, + "eval_samples_per_second": 2.247, + "eval_steps_per_second": 1.124, + "step": 4000 + }, + { + "epoch": 0.03, + "mmlu_eval_accuracy": 0.2525477994227994, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.07142857142857142, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, + "mmlu_loss": 3.5710299587249756, + "step": 4000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.07, + "step": 4001 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1685, + "step": 4002 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7579, + "step": 4003 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6209, + "step": 4004 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.1189, + "step": 4005 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4346, + "step": 4006 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0016, + "step": 4007 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1685, + "step": 4008 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3534, + "step": 4009 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.7965, + "step": 4010 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4994, + "step": 4011 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1394, + "step": 4012 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3169, + "step": 4013 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.261, + "step": 4014 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8362, + "step": 4015 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8274, + "step": 4016 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5416, + "step": 4017 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6363, + "step": 4018 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1583, + "step": 4019 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1575, + "step": 4020 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0904, + "step": 4021 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0278, + "step": 4022 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.8278, + "step": 4023 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.535, + "step": 4024 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3822, + "step": 4025 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0538, + "step": 4026 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2228, + "step": 4027 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.509, + "step": 4028 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4223, + "step": 4029 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1033, + "step": 4030 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1594, + "step": 4031 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3259, + "step": 4032 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1647, + "step": 4033 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3303, + "step": 4034 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5812, + "step": 4035 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4741, + "step": 4036 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.8335, + "step": 4037 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8835, + "step": 4038 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3226, + "step": 4039 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.3215, + "step": 4040 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.5474, + "step": 4041 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7139, + "step": 4042 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6725, + "step": 4043 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6821, + "step": 4044 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8876, + "step": 4045 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.4142, + "step": 4046 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4157, + "step": 4047 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.441, + "step": 4048 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8809, + "step": 4049 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8153, + "step": 4050 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1665, + "step": 4051 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5073, + "step": 4052 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4909, + "step": 4053 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9506, + "step": 4054 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.0875, + "step": 4055 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3291, + "step": 4056 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6951, + "step": 4057 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7277, + "step": 4058 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4189, + "step": 4059 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.9827, + "step": 4060 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0741, + "step": 4061 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0725, + "step": 4062 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9464, + "step": 4063 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.4358, + "step": 4064 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.4715, + "step": 4065 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3786, + "step": 4066 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4934, + "step": 4067 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2759, + "step": 4068 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6035, + "step": 4069 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2499, + "step": 4070 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1888, + "step": 4071 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.0456, + "step": 4072 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.951, + "step": 4073 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4353, + "step": 4074 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7801, + "step": 4075 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7917, + "step": 4076 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3531, + "step": 4077 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6331, + "step": 4078 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5221, + "step": 4079 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1318, + "step": 4080 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2375, + "step": 4081 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8938, + "step": 4082 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1559, + "step": 4083 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2144, + "step": 4084 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4366, + "step": 4085 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1172, + "step": 4086 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3419, + "step": 4087 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6779, + "step": 4088 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1309, + "step": 4089 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.019, + "step": 4090 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0277, + "step": 4091 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4826, + "step": 4092 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3211, + "step": 4093 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.6339, + "step": 4094 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.642, + "step": 4095 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7217, + "step": 4096 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.076, + "step": 4097 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5584, + "step": 4098 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.3251, + "step": 4099 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9014, + "step": 4100 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.184, + "step": 4101 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2203, + "step": 4102 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6179, + "step": 4103 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.4954, + "step": 4104 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.1081, + "step": 4105 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3472, + "step": 4106 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.5877, + "step": 4107 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2448, + "step": 4108 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6592, + "step": 4109 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2416, + "step": 4110 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7873, + "step": 4111 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4713, + "step": 4112 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3444, + "step": 4113 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.6219, + "step": 4114 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2364, + "step": 4115 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3227, + "step": 4116 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9786, + "step": 4117 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1939, + "step": 4118 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.781, + "step": 4119 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4606, + "step": 4120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.5705, + "step": 4121 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1414, + "step": 4122 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.79, + "step": 4123 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9486, + "step": 4124 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.341, + "step": 4125 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5912, + "step": 4126 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3169, + "step": 4127 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.329, + "step": 4128 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1756, + "step": 4129 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0224, + "step": 4130 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7741, + "step": 4131 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0836, + "step": 4132 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.4244, + "step": 4133 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.4046, + "step": 4134 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0585, + "step": 4135 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.8267, + "step": 4136 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8221, + "step": 4137 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.439, + "step": 4138 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7489, + "step": 4139 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1346, + "step": 4140 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.3109, + "step": 4141 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6994, + "step": 4142 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.7695, + "step": 4143 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.5227, + "step": 4144 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8782, + "step": 4145 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.1273, + "step": 4146 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.3641, + "step": 4147 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0747, + "step": 4148 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7932, + "step": 4149 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.8848, + "step": 4150 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.9808, + "step": 4151 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9369, + "step": 4152 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.941, + "step": 4153 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 10.5496, + "step": 4154 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1765, + "step": 4155 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2084, + "step": 4156 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7748, + "step": 4157 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7586, + "step": 4158 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.384, + "step": 4159 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8359, + "step": 4160 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2197, + "step": 4161 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6051, + "step": 4162 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4067, + "step": 4163 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.8194, + "step": 4164 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8618, + "step": 4165 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6215, + "step": 4166 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.567, + "step": 4167 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8779, + "step": 4168 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5998, + "step": 4169 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9171, + "step": 4170 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3797, + "step": 4171 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9968, + "step": 4172 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4823, + "step": 4173 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5109, + "step": 4174 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6709, + "step": 4175 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1483, + "step": 4176 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.869, + "step": 4177 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4267, + "step": 4178 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3919, + "step": 4179 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7326, + "step": 4180 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1336, + "step": 4181 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.6834, + "step": 4182 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0282, + "step": 4183 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.2146, + "step": 4184 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7769, + "step": 4185 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1111, + "step": 4186 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3518, + "step": 4187 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.759, + "step": 4188 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4789, + "step": 4189 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2913, + "step": 4190 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.426, + "step": 4191 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.4973, + "step": 4192 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7238, + "step": 4193 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3992, + "step": 4194 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9516, + "step": 4195 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7022, + "step": 4196 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8797, + "step": 4197 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1877, + "step": 4198 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6089, + "step": 4199 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.1715, + "step": 4200 + }, + { + "epoch": 0.03, + "eval_loss": 6.335043907165527, + "eval_runtime": 22.2336, + "eval_samples_per_second": 2.249, + "eval_steps_per_second": 1.124, + "step": 4200 + }, + { + "epoch": 0.03, + "mmlu_eval_accuracy": 0.16574224386724384, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.07142857142857142, + "mmlu_eval_accuracy_astronomy": 0.1875, + "mmlu_eval_accuracy_business_ethics": 0.2222222222222222, + "mmlu_loss": 3.6120329189300535, + "step": 4200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0172, + "step": 4201 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3208, + "step": 4202 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.9768, + "step": 4203 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.224, + "step": 4204 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.112, + "step": 4205 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.701, + "step": 4206 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8528, + "step": 4207 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7786, + "step": 4208 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3293, + "step": 4209 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.4986, + "step": 4210 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8651, + "step": 4211 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.4758, + "step": 4212 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.97, + "step": 4213 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1929, + "step": 4214 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9504, + "step": 4215 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.0213, + "step": 4216 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.263, + "step": 4217 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3761, + "step": 4218 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1654, + "step": 4219 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8854, + "step": 4220 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8664, + "step": 4221 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4596, + "step": 4222 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0252, + "step": 4223 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6682, + "step": 4224 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1858, + "step": 4225 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.6278, + "step": 4226 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.0912, + "step": 4227 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2919, + "step": 4228 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7584, + "step": 4229 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9987, + "step": 4230 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4801, + "step": 4231 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.4327, + "step": 4232 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.5746, + "step": 4233 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.3162, + "step": 4234 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8182, + "step": 4235 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6069, + "step": 4236 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9851, + "step": 4237 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.267, + "step": 4238 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9602, + "step": 4239 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5938, + "step": 4240 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8191, + "step": 4241 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5139, + "step": 4242 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6779, + "step": 4243 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5514, + "step": 4244 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1473, + "step": 4245 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.9735, + "step": 4246 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1424, + "step": 4247 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.5365, + "step": 4248 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3266, + "step": 4249 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5769, + "step": 4250 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0873, + "step": 4251 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5799, + "step": 4252 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3806, + "step": 4253 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4925, + "step": 4254 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.277, + "step": 4255 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1996, + "step": 4256 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8333, + "step": 4257 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8422, + "step": 4258 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.9374, + "step": 4259 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2097, + "step": 4260 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2198, + "step": 4261 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4534, + "step": 4262 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4716, + "step": 4263 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0549, + "step": 4264 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4943, + "step": 4265 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.604, + "step": 4266 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6695, + "step": 4267 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1015, + "step": 4268 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6276, + "step": 4269 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.498, + "step": 4270 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7066, + "step": 4271 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7354, + "step": 4272 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6121, + "step": 4273 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2452, + "step": 4274 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2953, + "step": 4275 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.0474, + "step": 4276 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7764, + "step": 4277 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3297, + "step": 4278 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.3487, + "step": 4279 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.0156, + "step": 4280 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6313, + "step": 4281 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7101, + "step": 4282 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.2139, + "step": 4283 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3695, + "step": 4284 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6284, + "step": 4285 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7277, + "step": 4286 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.1922, + "step": 4287 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1183, + "step": 4288 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.197, + "step": 4289 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.904, + "step": 4290 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4535, + "step": 4291 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1273, + "step": 4292 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.1631, + "step": 4293 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.5194, + "step": 4294 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.82, + "step": 4295 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2122, + "step": 4296 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4187, + "step": 4297 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0907, + "step": 4298 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.1236, + "step": 4299 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0835, + "step": 4300 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9702, + "step": 4301 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1675, + "step": 4302 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8704, + "step": 4303 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2006, + "step": 4304 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5152, + "step": 4305 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.511, + "step": 4306 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3767, + "step": 4307 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3041, + "step": 4308 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9436, + "step": 4309 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1931, + "step": 4310 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.6127, + "step": 4311 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3527, + "step": 4312 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3532, + "step": 4313 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3334, + "step": 4314 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8331, + "step": 4315 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8437, + "step": 4316 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6761, + "step": 4317 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7848, + "step": 4318 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3358, + "step": 4319 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5201, + "step": 4320 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.5625, + "step": 4321 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5408, + "step": 4322 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.6977, + "step": 4323 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5366, + "step": 4324 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.9571, + "step": 4325 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2711, + "step": 4326 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3046, + "step": 4327 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.2413, + "step": 4328 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9967, + "step": 4329 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.1353, + "step": 4330 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.8216, + "step": 4331 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.5285, + "step": 4332 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3631, + "step": 4333 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 1.8248, + "step": 4334 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8453, + "step": 4335 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.6769, + "step": 4336 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8327, + "step": 4337 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0459, + "step": 4338 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.8175, + "step": 4339 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0969, + "step": 4340 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6161, + "step": 4341 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.2014, + "step": 4342 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7473, + "step": 4343 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6217, + "step": 4344 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9042, + "step": 4345 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.1346, + "step": 4346 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.117, + "step": 4347 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3109, + "step": 4348 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.2498, + "step": 4349 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6814, + "step": 4350 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.21, + "step": 4351 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8487, + "step": 4352 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.4287, + "step": 4353 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6428, + "step": 4354 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2915, + "step": 4355 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2039, + "step": 4356 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.711, + "step": 4357 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.483, + "step": 4358 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.4743, + "step": 4359 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9889, + "step": 4360 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9437, + "step": 4361 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2056, + "step": 4362 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.3294, + "step": 4363 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3339, + "step": 4364 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2717, + "step": 4365 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0219, + "step": 4366 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7294, + "step": 4367 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.8137, + "step": 4368 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.427, + "step": 4369 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5406, + "step": 4370 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7468, + "step": 4371 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6483, + "step": 4372 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0566, + "step": 4373 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3174, + "step": 4374 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.6436, + "step": 4375 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.4422, + "step": 4376 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2401, + "step": 4377 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6105, + "step": 4378 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1438, + "step": 4379 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3631, + "step": 4380 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2301, + "step": 4381 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4289, + "step": 4382 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1199, + "step": 4383 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5042, + "step": 4384 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6666, + "step": 4385 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0249, + "step": 4386 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7435, + "step": 4387 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0024, + "step": 4388 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7167, + "step": 4389 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1492, + "step": 4390 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2656, + "step": 4391 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8501, + "step": 4392 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8394, + "step": 4393 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.123, + "step": 4394 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7029, + "step": 4395 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9924, + "step": 4396 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8444, + "step": 4397 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.257, + "step": 4398 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6532, + "step": 4399 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.4572, + "step": 4400 + }, + { + "epoch": 0.03, + "eval_loss": 6.4130682945251465, + "eval_runtime": 22.2069, + "eval_samples_per_second": 2.252, + "eval_steps_per_second": 1.126, + "step": 4400 + }, + { + "epoch": 0.03, + "mmlu_eval_accuracy": 0.2525477994227994, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.07142857142857142, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, + "mmlu_loss": 3.9351483535766603, + "step": 4400 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.8926, + "step": 4401 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2074, + "step": 4402 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.2537, + "step": 4403 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3481, + "step": 4404 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0276, + "step": 4405 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0158, + "step": 4406 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.3871, + "step": 4407 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.3431, + "step": 4408 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3419, + "step": 4409 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3535, + "step": 4410 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7519, + "step": 4411 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.51, + "step": 4412 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7046, + "step": 4413 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2059, + "step": 4414 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8505, + "step": 4415 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6643, + "step": 4416 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7986, + "step": 4417 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.8461, + "step": 4418 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0336, + "step": 4419 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2912, + "step": 4420 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.9896, + "step": 4421 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9749, + "step": 4422 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.81, + "step": 4423 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4518, + "step": 4424 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8091, + "step": 4425 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.9194, + "step": 4426 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5219, + "step": 4427 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6828, + "step": 4428 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1137, + "step": 4429 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6461, + "step": 4430 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0152, + "step": 4431 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8011, + "step": 4432 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4696, + "step": 4433 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8703, + "step": 4434 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5802, + "step": 4435 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.6443, + "step": 4436 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4945, + "step": 4437 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.8102, + "step": 4438 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1765, + "step": 4439 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9834, + "step": 4440 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.6708, + "step": 4441 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.9588, + "step": 4442 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7362, + "step": 4443 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5623, + "step": 4444 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5016, + "step": 4445 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0908, + "step": 4446 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6242, + "step": 4447 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1595, + "step": 4448 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6195, + "step": 4449 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.8756, + "step": 4450 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7154, + "step": 4451 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9695, + "step": 4452 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7481, + "step": 4453 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.9299, + "step": 4454 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.6756, + "step": 4455 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.517, + "step": 4456 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.8601, + "step": 4457 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0095, + "step": 4458 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9689, + "step": 4459 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8858, + "step": 4460 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5432, + "step": 4461 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0526, + "step": 4462 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7423, + "step": 4463 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2697, + "step": 4464 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6094, + "step": 4465 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4183, + "step": 4466 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8094, + "step": 4467 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8541, + "step": 4468 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7971, + "step": 4469 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7192, + "step": 4470 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3375, + "step": 4471 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.9396, + "step": 4472 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.445, + "step": 4473 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6076, + "step": 4474 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6831, + "step": 4475 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.6578, + "step": 4476 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.405, + "step": 4477 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8647, + "step": 4478 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.7002, + "step": 4479 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6372, + "step": 4480 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9604, + "step": 4481 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.0071, + "step": 4482 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.6172, + "step": 4483 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.04, + "step": 4484 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.448, + "step": 4485 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3027, + "step": 4486 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.9143, + "step": 4487 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8775, + "step": 4488 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7101, + "step": 4489 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.6692, + "step": 4490 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8102, + "step": 4491 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6985, + "step": 4492 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.702, + "step": 4493 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7977, + "step": 4494 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.515, + "step": 4495 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.5507, + "step": 4496 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5842, + "step": 4497 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6957, + "step": 4498 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1066, + "step": 4499 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5859, + "step": 4500 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.451, + "step": 4501 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7379, + "step": 4502 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.6385, + "step": 4503 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2374, + "step": 4504 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2351, + "step": 4505 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3621, + "step": 4506 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3366, + "step": 4507 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7719, + "step": 4508 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1457, + "step": 4509 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3899, + "step": 4510 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7728, + "step": 4511 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9292, + "step": 4512 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6082, + "step": 4513 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2944, + "step": 4514 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6249, + "step": 4515 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.5819, + "step": 4516 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9274, + "step": 4517 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0141, + "step": 4518 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7056, + "step": 4519 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9713, + "step": 4520 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5392, + "step": 4521 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5142, + "step": 4522 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.251, + "step": 4523 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.143, + "step": 4524 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9271, + "step": 4525 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.038, + "step": 4526 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.2392, + "step": 4527 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.4011, + "step": 4528 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5066, + "step": 4529 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.0143, + "step": 4530 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.8365, + "step": 4531 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7837, + "step": 4532 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6519, + "step": 4533 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.828, + "step": 4534 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9791, + "step": 4535 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0304, + "step": 4536 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6018, + "step": 4537 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3392, + "step": 4538 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3833, + "step": 4539 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.3103, + "step": 4540 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5934, + "step": 4541 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2171, + "step": 4542 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0982, + "step": 4543 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.2001, + "step": 4544 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.941, + "step": 4545 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.6556, + "step": 4546 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9446, + "step": 4547 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.006, + "step": 4548 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.4772, + "step": 4549 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8168, + "step": 4550 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9478, + "step": 4551 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5726, + "step": 4552 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3606, + "step": 4553 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.9709, + "step": 4554 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.863, + "step": 4555 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8277, + "step": 4556 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4027, + "step": 4557 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0372, + "step": 4558 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7037, + "step": 4559 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.864, + "step": 4560 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.826, + "step": 4561 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5887, + "step": 4562 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0238, + "step": 4563 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7874, + "step": 4564 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 10.1767, + "step": 4565 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2468, + "step": 4566 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6452, + "step": 4567 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.077, + "step": 4568 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2059, + "step": 4569 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.377, + "step": 4570 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.037, + "step": 4571 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.8317, + "step": 4572 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.3115, + "step": 4573 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0373, + "step": 4574 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.082, + "step": 4575 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7769, + "step": 4576 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8292, + "step": 4577 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3808, + "step": 4578 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.84, + "step": 4579 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9102, + "step": 4580 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2409, + "step": 4581 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.5787, + "step": 4582 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.043, + "step": 4583 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.5522, + "step": 4584 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.3926, + "step": 4585 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.4021, + "step": 4586 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.2431, + "step": 4587 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.1308, + "step": 4588 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.819, + "step": 4589 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.1779, + "step": 4590 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.8237, + "step": 4591 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.0223, + "step": 4592 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.866, + "step": 4593 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.0287, + "step": 4594 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.2001, + "step": 4595 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.0228, + "step": 4596 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.4247, + "step": 4597 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.6213, + "step": 4598 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.1848, + "step": 4599 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.77, + "step": 4600 + }, + { + "epoch": 0.04, + "eval_loss": 6.408684253692627, + "eval_runtime": 22.226, + "eval_samples_per_second": 2.25, + "eval_steps_per_second": 1.125, + "step": 4600 + }, + { + "epoch": 0.04, + "mmlu_eval_accuracy": 0.2525477994227994, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.07142857142857142, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, + "mmlu_loss": 3.758997039794922, + "step": 4600 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.0023, + "step": 4601 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.9024, + "step": 4602 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.5583, + "step": 4603 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.0193, + "step": 4604 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.3402, + "step": 4605 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.3268, + "step": 4606 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.7445, + "step": 4607 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.1852, + "step": 4608 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.1238, + "step": 4609 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.3282, + "step": 4610 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.254, + "step": 4611 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.1053, + "step": 4612 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.8592, + "step": 4613 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.6462, + "step": 4614 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.669, + "step": 4615 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.1189, + "step": 4616 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.6205, + "step": 4617 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.2284, + "step": 4618 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.4663, + "step": 4619 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.4695, + "step": 4620 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.2312, + "step": 4621 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.8074, + "step": 4622 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.1565, + "step": 4623 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.7891, + "step": 4624 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.1226, + "step": 4625 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.8755, + "step": 4626 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.0405, + "step": 4627 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.0656, + "step": 4628 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.9288, + "step": 4629 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.9022, + "step": 4630 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.2631, + "step": 4631 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.8818, + "step": 4632 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.8685, + "step": 4633 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.1125, + "step": 4634 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.4889, + "step": 4635 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.076, + "step": 4636 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.9786, + "step": 4637 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.8958, + "step": 4638 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.4134, + "step": 4639 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.088, + "step": 4640 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.8596, + "step": 4641 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.3521, + "step": 4642 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.5089, + "step": 4643 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.8831, + "step": 4644 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.3512, + "step": 4645 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.0645, + "step": 4646 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.2109, + "step": 4647 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.69, + "step": 4648 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.9733, + "step": 4649 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.3857, + "step": 4650 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.9065, + "step": 4651 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.8798, + "step": 4652 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.9107, + "step": 4653 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.2948, + "step": 4654 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.3601, + "step": 4655 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.8386, + "step": 4656 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.4677, + "step": 4657 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.9595, + "step": 4658 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.901, + "step": 4659 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.7767, + "step": 4660 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.7727, + "step": 4661 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.1065, + "step": 4662 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.663, + "step": 4663 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.6291, + "step": 4664 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.2727, + "step": 4665 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.8332, + "step": 4666 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.5858, + "step": 4667 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.6918, + "step": 4668 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.4193, + "step": 4669 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.236, + "step": 4670 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.1981, + "step": 4671 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.8736, + "step": 4672 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.9571, + "step": 4673 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.5301, + "step": 4674 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.5035, + "step": 4675 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.3201, + "step": 4676 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.2301, + "step": 4677 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.5596, + "step": 4678 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.1596, + "step": 4679 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.6705, + "step": 4680 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.3611, + "step": 4681 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.2311, + "step": 4682 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.1461, + "step": 4683 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.8506, + "step": 4684 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.1727, + "step": 4685 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.8938, + "step": 4686 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.1876, + "step": 4687 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.4972, + "step": 4688 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.5906, + "step": 4689 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.6744, + "step": 4690 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.4496, + "step": 4691 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.9415, + "step": 4692 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.3577, + "step": 4693 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.672, + "step": 4694 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.5867, + "step": 4695 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.4507, + "step": 4696 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.015, + "step": 4697 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.532, + "step": 4698 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.5171, + "step": 4699 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.0666, + "step": 4700 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 9.8359, + "step": 4701 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.6945, + "step": 4702 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 9.4712, + "step": 4703 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.2963, + "step": 4704 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.4449, + "step": 4705 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.08, + "step": 4706 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.8184, + "step": 4707 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.8886, + "step": 4708 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.4294, + "step": 4709 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.6778, + "step": 4710 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.9429, + "step": 4711 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.185, + "step": 4712 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.8384, + "step": 4713 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.7196, + "step": 4714 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.8159, + "step": 4715 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.9228, + "step": 4716 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.922, + "step": 4717 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.2769, + "step": 4718 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.4102, + "step": 4719 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.3473, + "step": 4720 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.7315, + "step": 4721 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.5719, + "step": 4722 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.45, + "step": 4723 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.7166, + "step": 4724 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.003, + "step": 4725 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.6474, + "step": 4726 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.0281, + "step": 4727 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.1768, + "step": 4728 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.3749, + "step": 4729 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.5307, + "step": 4730 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.9728, + "step": 4731 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.7612, + "step": 4732 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 9.0616, + "step": 4733 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.7188, + "step": 4734 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.9895, + "step": 4735 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 9.2942, + "step": 4736 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.6735, + "step": 4737 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.1246, + "step": 4738 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.5913, + "step": 4739 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.7382, + "step": 4740 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.629, + "step": 4741 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.1264, + "step": 4742 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.5582, + "step": 4743 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.9757, + "step": 4744 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.4638, + "step": 4745 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.9211, + "step": 4746 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.5218, + "step": 4747 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.3471, + "step": 4748 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.4113, + "step": 4749 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.4051, + "step": 4750 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.8378, + "step": 4751 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.278, + "step": 4752 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.2717, + "step": 4753 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.7359, + "step": 4754 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.4031, + "step": 4755 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.6597, + "step": 4756 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.394, + "step": 4757 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.3462, + "step": 4758 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.4558, + "step": 4759 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.1341, + "step": 4760 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.6349, + "step": 4761 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.3346, + "step": 4762 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.188, + "step": 4763 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.0209, + "step": 4764 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.4156, + "step": 4765 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.3871, + "step": 4766 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.7964, + "step": 4767 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.2041, + "step": 4768 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.3015, + "step": 4769 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.1091, + "step": 4770 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.2722, + "step": 4771 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.0096, + "step": 4772 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.2771, + "step": 4773 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.5131, + "step": 4774 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.8754, + "step": 4775 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 8.0951, + "step": 4776 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.5389, + "step": 4777 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.6913, + "step": 4778 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.3431, + "step": 4779 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.3538, + "step": 4780 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.4533, + "step": 4781 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.6601, + "step": 4782 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.9653, + "step": 4783 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.0344, + "step": 4784 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.0719, + "step": 4785 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 3.3465, + "step": 4786 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.5308, + "step": 4787 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.9054, + "step": 4788 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.2575, + "step": 4789 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.905, + "step": 4790 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.7652, + "step": 4791 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 2.3971, + "step": 4792 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 7.9916, + "step": 4793 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.3688, + "step": 4794 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.9891, + "step": 4795 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.7919, + "step": 4796 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 6.2468, + "step": 4797 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.1588, + "step": 4798 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 4.0453, + "step": 4799 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004, + "loss": 5.7421, + "step": 4800 + }, + { + "epoch": 0.04, + "eval_loss": 6.6897292137146, + "eval_runtime": 22.2665, + "eval_samples_per_second": 2.246, + "eval_steps_per_second": 1.123, + "step": 4800 + }, + { + "epoch": 0.04, + "mmlu_eval_accuracy": 0.2525477994227994, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.07142857142857142, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, + "mmlu_loss": 4.275100479125976, + "step": 4800 + }, + { + "epoch": 0.04, + "step": 4800, + "total_flos": 7.930864121570918e+16, + "train_loss": 1.2399261393149694, + "train_runtime": 2886.6567, + "train_samples_per_second": 10.393, + "train_steps_per_second": 10.393 } ], "max_steps": 30000, "num_train_epochs": 1, - "total_flos": 6.198225528943411e+16, + "total_flos": 7.930864121570918e+16, "trial_name": null, "trial_params": null } diff --git a/training_args.bin b/training_args.bin index a8f7d1b91987fbda198ccfdfec732ab7330c2c7a..cdd9dec0839dd55328a8e59cbb2bcf7c0315c309 100644 --- a/training_args.bin +++ b/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:accdb0dacbe8f49c6833042066e37320e33fafca5f5aad521823e7572752725d +oid sha256:bd2e97ea6d2b8e9c0ed7efbb032ce79458292aa99ced1bbeb7b777b9663a324f size 6011