diff --git a/adapter_model.bin b/adapter_model.bin
index 17a5d9a4024f623f507a7c923ee385b59403ab9b..aa38a64bfc3a8cb8c55c290beaf3783f62c8da4e 100644
--- a/adapter_model.bin
+++ b/adapter_model.bin
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:18f11fbc4708b106870eec7154c2b9bbcad7ba5b185b2bacd1b7a7c4926deed7
+oid sha256:23824721820d37d6fe44fee9306d0e71a5826aebaf3eb2f970cab6872288b55a
size 871609293
diff --git a/checkpoint-1000/adapter_model.bin b/checkpoint-1000/adapter_model.bin
deleted file mode 100644
index c702b294b059ee5e9af390cf41dd54c59db22719..0000000000000000000000000000000000000000
--- a/checkpoint-1000/adapter_model.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:387e723c7a914bffb7e4d1c12aa1a008f5cc8f094792f9319a8cd7266c47ae6b
-size 871609293
diff --git a/checkpoint-1000/adapter_model/adapter_model/README.md b/checkpoint-1000/adapter_model/adapter_model/README.md
deleted file mode 100644
index b2a9ac08c477a18d16ef75ee89b21cee91a6169a..0000000000000000000000000000000000000000
--- a/checkpoint-1000/adapter_model/adapter_model/README.md
+++ /dev/null
@@ -1,44 +0,0 @@
----
-library_name: peft
----
-## Training procedure
-
-
-The following `bitsandbytes` quantization config was used during training:
-- load_in_8bit: False
-- load_in_4bit: True
-- llm_int8_threshold: 6.0
-- llm_int8_skip_modules: None
-- llm_int8_enable_fp32_cpu_offload: False
-- llm_int8_has_fp16_weight: False
-- bnb_4bit_quant_type: nf4
-- bnb_4bit_use_double_quant: True
-- bnb_4bit_compute_dtype: bfloat16
-
-The following `bitsandbytes` quantization config was used during training:
-- load_in_8bit: False
-- load_in_4bit: True
-- llm_int8_threshold: 6.0
-- llm_int8_skip_modules: None
-- llm_int8_enable_fp32_cpu_offload: False
-- llm_int8_has_fp16_weight: False
-- bnb_4bit_quant_type: nf4
-- bnb_4bit_use_double_quant: True
-- bnb_4bit_compute_dtype: bfloat16
-
-The following `bitsandbytes` quantization config was used during training:
-- load_in_8bit: False
-- load_in_4bit: True
-- llm_int8_threshold: 6.0
-- llm_int8_skip_modules: None
-- llm_int8_enable_fp32_cpu_offload: False
-- llm_int8_has_fp16_weight: False
-- bnb_4bit_quant_type: nf4
-- bnb_4bit_use_double_quant: True
-- bnb_4bit_compute_dtype: bfloat16
-### Framework versions
-
-- PEFT 0.4.0
-- PEFT 0.4.0
-
-- PEFT 0.4.0
diff --git a/checkpoint-1000/adapter_model/adapter_model/adapter_model.bin b/checkpoint-1000/adapter_model/adapter_model/adapter_model.bin
deleted file mode 100644
index ba3f0b56e75d88ed0a54d7e5d2e9b0dbb3953c67..0000000000000000000000000000000000000000
--- a/checkpoint-1000/adapter_model/adapter_model/adapter_model.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:80118c725477dea393f7c5d033e93b59658969b94a87d00f4bf43d4221785903
-size 871609293
diff --git a/checkpoint-1000/optimizer.pt b/checkpoint-1000/optimizer.pt
deleted file mode 100644
index 44dbb5257280c52b96763f555661b0a7f06f21b8..0000000000000000000000000000000000000000
--- a/checkpoint-1000/optimizer.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ce3190f57432e0973d291995c12b59dd5194b3d8079b426d06cba01005b0c39f
-size 873873439
diff --git a/checkpoint-1000/rng_state.pth b/checkpoint-1000/rng_state.pth
deleted file mode 100644
index a60d6abfaca5a5d219221a27da6ab5214adfa458..0000000000000000000000000000000000000000
--- a/checkpoint-1000/rng_state.pth
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:eea4d2d84d0669c268f9b7375a0008cb8ca8f8c06d8427bb52a1d43b533afae1
-size 14511
diff --git a/checkpoint-1000/scheduler.pt b/checkpoint-1000/scheduler.pt
deleted file mode 100644
index e7862dc88d747c6be38c8fa597926c62ef32e4f0..0000000000000000000000000000000000000000
--- a/checkpoint-1000/scheduler.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:128a4564407f8cbd339459b6c4e630aaf9720453458a1460dee16f26280f4cf7
-size 627
diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json
deleted file mode 100644
index 769b3743676c0ba2b09fbca9f1f1f50b0c8089df..0000000000000000000000000000000000000000
--- a/checkpoint-1000/trainer_state.json
+++ /dev/null
@@ -1,6106 +0,0 @@
-{
- "best_metric": 6.766034126281738,
- "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-1000",
- "epoch": 0.007638835841417768,
- "global_step": 1000,
- "is_hyper_param_search": false,
- "is_local_process_zero": true,
- "is_world_process_zero": true,
- "log_history": [
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0808,
- "step": 1
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8773,
- "step": 2
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1965,
- "step": 3
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.118,
- "step": 4
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1773,
- "step": 5
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1165,
- "step": 6
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2666,
- "step": 7
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3704,
- "step": 8
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9976,
- "step": 9
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.985,
- "step": 10
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.0541,
- "step": 11
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.6228,
- "step": 12
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.3651,
- "step": 13
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0867,
- "step": 14
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.4422,
- "step": 15
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.7759,
- "step": 16
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1446,
- "step": 17
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0007,
- "step": 18
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0894,
- "step": 19
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2424,
- "step": 20
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.1343,
- "step": 21
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5354,
- "step": 22
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1887,
- "step": 23
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.6652,
- "step": 24
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.964,
- "step": 25
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1872,
- "step": 26
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.4722,
- "step": 27
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1462,
- "step": 28
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0485,
- "step": 29
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.148,
- "step": 30
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7274,
- "step": 31
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.6689,
- "step": 32
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3384,
- "step": 33
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.5354,
- "step": 34
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.1976,
- "step": 35
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.8593,
- "step": 36
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.9302,
- "step": 37
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5968,
- "step": 38
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3169,
- "step": 39
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.1793,
- "step": 40
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.8457,
- "step": 41
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5177,
- "step": 42
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.003,
- "step": 43
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.9928,
- "step": 44
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.2574,
- "step": 45
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3915,
- "step": 46
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4105,
- "step": 47
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.1184,
- "step": 48
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.72,
- "step": 49
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9628,
- "step": 50
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2372,
- "step": 51
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3733,
- "step": 52
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8936,
- "step": 53
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5353,
- "step": 54
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.0754,
- "step": 55
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6685,
- "step": 56
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8984,
- "step": 57
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2265,
- "step": 58
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7696,
- "step": 59
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7349,
- "step": 60
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0221,
- "step": 61
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.1901,
- "step": 62
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.387,
- "step": 63
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7323,
- "step": 64
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2077,
- "step": 65
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.3155,
- "step": 66
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1656,
- "step": 67
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 13.0828,
- "step": 68
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5295,
- "step": 69
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4575,
- "step": 70
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.7654,
- "step": 71
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.6263,
- "step": 72
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 24.8238,
- "step": 73
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.0654,
- "step": 74
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 28.1046,
- "step": 75
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.3232,
- "step": 76
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 22.9712,
- "step": 77
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 18.8529,
- "step": 78
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.8356,
- "step": 79
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 16.472,
- "step": 80
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.2369,
- "step": 81
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.0731,
- "step": 82
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.8853,
- "step": 83
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5438,
- "step": 84
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2665,
- "step": 85
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5484,
- "step": 86
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7546,
- "step": 87
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4309,
- "step": 88
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5593,
- "step": 89
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3822,
- "step": 90
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.6315,
- "step": 91
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6116,
- "step": 92
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2288,
- "step": 93
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0053,
- "step": 94
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.359,
- "step": 95
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9235,
- "step": 96
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 31.9845,
- "step": 97
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.1385,
- "step": 98
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6161,
- "step": 99
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8096,
- "step": 100
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9918,
- "step": 101
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.344,
- "step": 102
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1607,
- "step": 103
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4834,
- "step": 104
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.704,
- "step": 105
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1238,
- "step": 106
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8066,
- "step": 107
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9656,
- "step": 108
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1979,
- "step": 109
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2294,
- "step": 110
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.066,
- "step": 111
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7914,
- "step": 112
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7344,
- "step": 113
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6703,
- "step": 114
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8817,
- "step": 115
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.7733,
- "step": 116
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.469,
- "step": 117
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1304,
- "step": 118
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.871,
- "step": 119
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5353,
- "step": 120
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9055,
- "step": 121
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6142,
- "step": 122
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0201,
- "step": 123
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3805,
- "step": 124
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6825,
- "step": 125
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7166,
- "step": 126
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.7747,
- "step": 127
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7695,
- "step": 128
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7291,
- "step": 129
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1296,
- "step": 130
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5374,
- "step": 131
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.1854,
- "step": 132
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.434,
- "step": 133
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.438,
- "step": 134
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3027,
- "step": 135
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.382,
- "step": 136
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9277,
- "step": 137
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.223,
- "step": 138
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3042,
- "step": 139
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6361,
- "step": 140
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3547,
- "step": 141
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.7181,
- "step": 142
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.7528,
- "step": 143
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.4316,
- "step": 144
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2219,
- "step": 145
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7788,
- "step": 146
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2749,
- "step": 147
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2397,
- "step": 148
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6243,
- "step": 149
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.145,
- "step": 150
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7951,
- "step": 151
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1862,
- "step": 152
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1305,
- "step": 153
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5766,
- "step": 154
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9232,
- "step": 155
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9936,
- "step": 156
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.9692,
- "step": 157
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2772,
- "step": 158
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.302,
- "step": 159
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9931,
- "step": 160
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9675,
- "step": 161
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8536,
- "step": 162
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6589,
- "step": 163
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.932,
- "step": 164
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0301,
- "step": 165
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4861,
- "step": 166
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1354,
- "step": 167
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0717,
- "step": 168
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9346,
- "step": 169
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9373,
- "step": 170
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8777,
- "step": 171
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4193,
- "step": 172
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6831,
- "step": 173
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4175,
- "step": 174
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3629,
- "step": 175
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.118,
- "step": 176
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.633,
- "step": 177
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8355,
- "step": 178
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4522,
- "step": 179
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9272,
- "step": 180
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4631,
- "step": 181
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2987,
- "step": 182
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1183,
- "step": 183
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.9976,
- "step": 184
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0668,
- "step": 185
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6291,
- "step": 186
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5937,
- "step": 187
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7382,
- "step": 188
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7677,
- "step": 189
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.0293,
- "step": 190
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.6407,
- "step": 191
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9508,
- "step": 192
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.5053,
- "step": 193
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.5718,
- "step": 194
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5211,
- "step": 195
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9557,
- "step": 196
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1609,
- "step": 197
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8505,
- "step": 198
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8278,
- "step": 199
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8447,
- "step": 200
- },
- {
- "epoch": 0.0,
- "eval_loss": 7.883856773376465,
- "eval_runtime": 22.4254,
- "eval_samples_per_second": 2.23,
- "eval_steps_per_second": 1.115,
- "step": 200
- },
- {
- "epoch": 0.0,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 4.629522514343262,
- "step": 200
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3249,
- "step": 201
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.352,
- "step": 202
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2984,
- "step": 203
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.2734,
- "step": 204
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1,
- "step": 205
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.448,
- "step": 206
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2387,
- "step": 207
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.861,
- "step": 208
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.603,
- "step": 209
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.29,
- "step": 210
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2105,
- "step": 211
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.1949,
- "step": 212
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0538,
- "step": 213
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0343,
- "step": 214
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7794,
- "step": 215
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5532,
- "step": 216
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2676,
- "step": 217
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.566,
- "step": 218
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0432,
- "step": 219
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9391,
- "step": 220
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.724,
- "step": 221
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.229,
- "step": 222
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3462,
- "step": 223
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0752,
- "step": 224
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.1966,
- "step": 225
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7279,
- "step": 226
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8484,
- "step": 227
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7291,
- "step": 228
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2665,
- "step": 229
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3551,
- "step": 230
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7338,
- "step": 231
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8407,
- "step": 232
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3581,
- "step": 233
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.441,
- "step": 234
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0788,
- "step": 235
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8404,
- "step": 236
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4314,
- "step": 237
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8426,
- "step": 238
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.0205,
- "step": 239
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4162,
- "step": 240
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7515,
- "step": 241
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1442,
- "step": 242
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5868,
- "step": 243
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6514,
- "step": 244
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2683,
- "step": 245
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.31,
- "step": 246
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0161,
- "step": 247
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.484,
- "step": 248
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9726,
- "step": 249
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0926,
- "step": 250
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5279,
- "step": 251
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0017,
- "step": 252
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5684,
- "step": 253
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3875,
- "step": 254
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9489,
- "step": 255
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.8948,
- "step": 256
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0856,
- "step": 257
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.599,
- "step": 258
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1575,
- "step": 259
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3701,
- "step": 260
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.464,
- "step": 261
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9193,
- "step": 262
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5679,
- "step": 263
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9424,
- "step": 264
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6689,
- "step": 265
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6475,
- "step": 266
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4311,
- "step": 267
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7426,
- "step": 268
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5191,
- "step": 269
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3059,
- "step": 270
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0142,
- "step": 271
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.4509,
- "step": 272
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0831,
- "step": 273
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6977,
- "step": 274
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4236,
- "step": 275
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2129,
- "step": 276
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1394,
- "step": 277
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.685,
- "step": 278
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0275,
- "step": 279
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.3215,
- "step": 280
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6542,
- "step": 281
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7614,
- "step": 282
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2996,
- "step": 283
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6275,
- "step": 284
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8736,
- "step": 285
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4667,
- "step": 286
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8486,
- "step": 287
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2125,
- "step": 288
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4523,
- "step": 289
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.551,
- "step": 290
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.7158,
- "step": 291
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5092,
- "step": 292
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9169,
- "step": 293
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5333,
- "step": 294
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9949,
- "step": 295
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.7189,
- "step": 296
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2366,
- "step": 297
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4745,
- "step": 298
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2439,
- "step": 299
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4176,
- "step": 300
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.9365,
- "step": 301
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5309,
- "step": 302
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2201,
- "step": 303
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0312,
- "step": 304
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4173,
- "step": 305
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4856,
- "step": 306
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5041,
- "step": 307
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3597,
- "step": 308
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8395,
- "step": 309
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0776,
- "step": 310
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7566,
- "step": 311
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9767,
- "step": 312
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3804,
- "step": 313
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5327,
- "step": 314
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5293,
- "step": 315
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4531,
- "step": 316
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3961,
- "step": 317
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5669,
- "step": 318
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8559,
- "step": 319
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.117,
- "step": 320
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4279,
- "step": 321
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7977,
- "step": 322
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.955,
- "step": 323
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0164,
- "step": 324
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.0495,
- "step": 325
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2768,
- "step": 326
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3162,
- "step": 327
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.88,
- "step": 328
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2157,
- "step": 329
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8427,
- "step": 330
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9729,
- "step": 331
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1779,
- "step": 332
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1302,
- "step": 333
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7705,
- "step": 334
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.523,
- "step": 335
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9375,
- "step": 336
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.1409,
- "step": 337
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.633,
- "step": 338
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6481,
- "step": 339
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.933,
- "step": 340
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9179,
- "step": 341
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9332,
- "step": 342
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6553,
- "step": 343
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7412,
- "step": 344
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.849,
- "step": 345
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.7321,
- "step": 346
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9717,
- "step": 347
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3465,
- "step": 348
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4535,
- "step": 349
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2376,
- "step": 350
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9025,
- "step": 351
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.916,
- "step": 352
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.3785,
- "step": 353
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0576,
- "step": 354
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5081,
- "step": 355
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1303,
- "step": 356
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3854,
- "step": 357
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5553,
- "step": 358
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9627,
- "step": 359
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.402,
- "step": 360
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3484,
- "step": 361
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5428,
- "step": 362
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9128,
- "step": 363
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3934,
- "step": 364
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4812,
- "step": 365
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5395,
- "step": 366
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6304,
- "step": 367
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5626,
- "step": 368
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5693,
- "step": 369
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3458,
- "step": 370
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6254,
- "step": 371
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8706,
- "step": 372
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6076,
- "step": 373
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.2912,
- "step": 374
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3326,
- "step": 375
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3735,
- "step": 376
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4916,
- "step": 377
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5553,
- "step": 378
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6241,
- "step": 379
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6106,
- "step": 380
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.266,
- "step": 381
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7738,
- "step": 382
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4988,
- "step": 383
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2968,
- "step": 384
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8512,
- "step": 385
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0341,
- "step": 386
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.898,
- "step": 387
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.23,
- "step": 388
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9608,
- "step": 389
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.3679,
- "step": 390
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.7074,
- "step": 391
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9903,
- "step": 392
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5845,
- "step": 393
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6493,
- "step": 394
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7962,
- "step": 395
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4865,
- "step": 396
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3418,
- "step": 397
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3942,
- "step": 398
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4715,
- "step": 399
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2073,
- "step": 400
- },
- {
- "epoch": 0.0,
- "eval_loss": 7.106412410736084,
- "eval_runtime": 22.5667,
- "eval_samples_per_second": 2.216,
- "eval_steps_per_second": 1.108,
- "step": 400
- },
- {
- "epoch": 0.0,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 2.9128687667846678,
- "step": 400
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3984,
- "step": 401
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7983,
- "step": 402
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8589,
- "step": 403
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9884,
- "step": 404
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4427,
- "step": 405
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0374,
- "step": 406
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7999,
- "step": 407
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2437,
- "step": 408
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6902,
- "step": 409
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.81,
- "step": 410
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8979,
- "step": 411
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0211,
- "step": 412
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3945,
- "step": 413
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5807,
- "step": 414
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1433,
- "step": 415
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9466,
- "step": 416
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6276,
- "step": 417
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4945,
- "step": 418
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.6215,
- "step": 419
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.3919,
- "step": 420
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7915,
- "step": 421
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3284,
- "step": 422
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8723,
- "step": 423
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0149,
- "step": 424
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.979,
- "step": 425
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9175,
- "step": 426
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4994,
- "step": 427
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9791,
- "step": 428
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1156,
- "step": 429
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5813,
- "step": 430
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1882,
- "step": 431
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9956,
- "step": 432
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6189,
- "step": 433
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9624,
- "step": 434
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5387,
- "step": 435
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4605,
- "step": 436
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.474,
- "step": 437
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0497,
- "step": 438
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5705,
- "step": 439
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.275,
- "step": 440
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9638,
- "step": 441
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4857,
- "step": 442
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3067,
- "step": 443
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8152,
- "step": 444
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1668,
- "step": 445
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5293,
- "step": 446
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3981,
- "step": 447
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4787,
- "step": 448
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5981,
- "step": 449
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3569,
- "step": 450
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4088,
- "step": 451
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3677,
- "step": 452
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.4686,
- "step": 453
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3552,
- "step": 454
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7931,
- "step": 455
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9285,
- "step": 456
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0554,
- "step": 457
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7277,
- "step": 458
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2474,
- "step": 459
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9274,
- "step": 460
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2558,
- "step": 461
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.7547,
- "step": 462
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1264,
- "step": 463
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2124,
- "step": 464
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8751,
- "step": 465
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7317,
- "step": 466
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3697,
- "step": 467
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0021,
- "step": 468
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3761,
- "step": 469
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2291,
- "step": 470
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7968,
- "step": 471
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9454,
- "step": 472
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0194,
- "step": 473
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5048,
- "step": 474
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6837,
- "step": 475
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1066,
- "step": 476
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3501,
- "step": 477
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5071,
- "step": 478
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1086,
- "step": 479
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7269,
- "step": 480
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5419,
- "step": 481
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2974,
- "step": 482
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.1433,
- "step": 483
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0869,
- "step": 484
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.032,
- "step": 485
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0946,
- "step": 486
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7162,
- "step": 487
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0406,
- "step": 488
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.9048,
- "step": 489
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2231,
- "step": 490
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.6524,
- "step": 491
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1151,
- "step": 492
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.591,
- "step": 493
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1628,
- "step": 494
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0757,
- "step": 495
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3471,
- "step": 496
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9385,
- "step": 497
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9362,
- "step": 498
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2252,
- "step": 499
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.359,
- "step": 500
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0497,
- "step": 501
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0484,
- "step": 502
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5773,
- "step": 503
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.39,
- "step": 504
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5923,
- "step": 505
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2,
- "step": 506
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5536,
- "step": 507
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.8958,
- "step": 508
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7763,
- "step": 509
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2045,
- "step": 510
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.4219,
- "step": 511
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6305,
- "step": 512
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4243,
- "step": 513
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7842,
- "step": 514
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8769,
- "step": 515
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8903,
- "step": 516
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0489,
- "step": 517
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1314,
- "step": 518
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5973,
- "step": 519
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8022,
- "step": 520
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3539,
- "step": 521
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.222,
- "step": 522
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5403,
- "step": 523
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1323,
- "step": 524
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7813,
- "step": 525
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4982,
- "step": 526
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2426,
- "step": 527
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0142,
- "step": 528
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8996,
- "step": 529
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8671,
- "step": 530
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4139,
- "step": 531
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9478,
- "step": 532
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7062,
- "step": 533
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.0098,
- "step": 534
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9195,
- "step": 535
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0255,
- "step": 536
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6291,
- "step": 537
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3245,
- "step": 538
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6382,
- "step": 539
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.8076,
- "step": 540
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6725,
- "step": 541
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0563,
- "step": 542
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.6178,
- "step": 543
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7974,
- "step": 544
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.7535,
- "step": 545
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4948,
- "step": 546
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8941,
- "step": 547
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.6496,
- "step": 548
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.9084,
- "step": 549
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.65,
- "step": 550
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7673,
- "step": 551
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2221,
- "step": 552
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.14,
- "step": 553
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.6747,
- "step": 554
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8009,
- "step": 555
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7307,
- "step": 556
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0143,
- "step": 557
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8098,
- "step": 558
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.026,
- "step": 559
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4572,
- "step": 560
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7913,
- "step": 561
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9962,
- "step": 562
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.767,
- "step": 563
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9497,
- "step": 564
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9626,
- "step": 565
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2536,
- "step": 566
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0421,
- "step": 567
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8177,
- "step": 568
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9241,
- "step": 569
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0162,
- "step": 570
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3368,
- "step": 571
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7515,
- "step": 572
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6389,
- "step": 573
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.662,
- "step": 574
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8097,
- "step": 575
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9346,
- "step": 576
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3154,
- "step": 577
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7724,
- "step": 578
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3685,
- "step": 579
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.2775,
- "step": 580
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.106,
- "step": 581
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4733,
- "step": 582
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2334,
- "step": 583
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9478,
- "step": 584
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0013,
- "step": 585
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7242,
- "step": 586
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.922,
- "step": 587
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1418,
- "step": 588
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4472,
- "step": 589
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4785,
- "step": 590
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.783,
- "step": 591
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0706,
- "step": 592
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4136,
- "step": 593
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5969,
- "step": 594
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5157,
- "step": 595
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5658,
- "step": 596
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4647,
- "step": 597
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2028,
- "step": 598
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6913,
- "step": 599
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7239,
- "step": 600
- },
- {
- "epoch": 0.0,
- "eval_loss": 7.012163162231445,
- "eval_runtime": 22.5807,
- "eval_samples_per_second": 2.214,
- "eval_steps_per_second": 1.107,
- "step": 600
- },
- {
- "epoch": 0.0,
- "mmlu_eval_accuracy": 0.3260281385281385,
- "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
- "mmlu_eval_accuracy_anatomy": 0.35714285714285715,
- "mmlu_eval_accuracy_astronomy": 0.25,
- "mmlu_eval_accuracy_business_ethics": 0.3333333333333333,
- "mmlu_loss": 4.24488224029541,
- "step": 600
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5253,
- "step": 601
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0392,
- "step": 602
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.447,
- "step": 603
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9441,
- "step": 604
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1874,
- "step": 605
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7817,
- "step": 606
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0348,
- "step": 607
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5593,
- "step": 608
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9361,
- "step": 609
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3534,
- "step": 610
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.476,
- "step": 611
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0937,
- "step": 612
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3027,
- "step": 613
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5586,
- "step": 614
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3796,
- "step": 615
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.676,
- "step": 616
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.5321,
- "step": 617
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0059,
- "step": 618
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6139,
- "step": 619
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.2391,
- "step": 620
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.0636,
- "step": 621
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0895,
- "step": 622
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.62,
- "step": 623
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0469,
- "step": 624
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2173,
- "step": 625
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9432,
- "step": 626
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3928,
- "step": 627
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0959,
- "step": 628
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.1197,
- "step": 629
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4277,
- "step": 630
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.418,
- "step": 631
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8687,
- "step": 632
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0156,
- "step": 633
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.573,
- "step": 634
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.112,
- "step": 635
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8954,
- "step": 636
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.36,
- "step": 637
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.924,
- "step": 638
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.4625,
- "step": 639
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2023,
- "step": 640
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0685,
- "step": 641
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.5304,
- "step": 642
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4456,
- "step": 643
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7271,
- "step": 644
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6011,
- "step": 645
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.895,
- "step": 646
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.864,
- "step": 647
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3452,
- "step": 648
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8978,
- "step": 649
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2253,
- "step": 650
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2813,
- "step": 651
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7248,
- "step": 652
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4283,
- "step": 653
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4304,
- "step": 654
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3893,
- "step": 655
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1115,
- "step": 656
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5892,
- "step": 657
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6572,
- "step": 658
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.925,
- "step": 659
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4431,
- "step": 660
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7711,
- "step": 661
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9439,
- "step": 662
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3781,
- "step": 663
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5573,
- "step": 664
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.4476,
- "step": 665
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0057,
- "step": 666
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2702,
- "step": 667
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5717,
- "step": 668
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2242,
- "step": 669
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1,
- "step": 670
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0517,
- "step": 671
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6543,
- "step": 672
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1138,
- "step": 673
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.461,
- "step": 674
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.7094,
- "step": 675
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.521,
- "step": 676
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7116,
- "step": 677
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6343,
- "step": 678
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3762,
- "step": 679
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3603,
- "step": 680
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7144,
- "step": 681
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4545,
- "step": 682
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8188,
- "step": 683
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7965,
- "step": 684
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.4675,
- "step": 685
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0436,
- "step": 686
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1219,
- "step": 687
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4517,
- "step": 688
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8476,
- "step": 689
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 1.9284,
- "step": 690
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7405,
- "step": 691
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7142,
- "step": 692
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3979,
- "step": 693
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 1.3285,
- "step": 694
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3418,
- "step": 695
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4472,
- "step": 696
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7355,
- "step": 697
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7982,
- "step": 698
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.4516,
- "step": 699
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.2532,
- "step": 700
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.9959,
- "step": 701
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0418,
- "step": 702
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.7767,
- "step": 703
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.774,
- "step": 704
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8912,
- "step": 705
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2178,
- "step": 706
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.6197,
- "step": 707
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4755,
- "step": 708
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8276,
- "step": 709
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.2925,
- "step": 710
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.3887,
- "step": 711
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1465,
- "step": 712
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.5806,
- "step": 713
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3063,
- "step": 714
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6066,
- "step": 715
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1536,
- "step": 716
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5582,
- "step": 717
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0353,
- "step": 718
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6415,
- "step": 719
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8291,
- "step": 720
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.7575,
- "step": 721
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9141,
- "step": 722
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5217,
- "step": 723
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4549,
- "step": 724
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8112,
- "step": 725
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2729,
- "step": 726
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8515,
- "step": 727
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9712,
- "step": 728
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.097,
- "step": 729
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0208,
- "step": 730
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1377,
- "step": 731
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4019,
- "step": 732
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.9869,
- "step": 733
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2954,
- "step": 734
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4144,
- "step": 735
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8053,
- "step": 736
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8891,
- "step": 737
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.812,
- "step": 738
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2657,
- "step": 739
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3747,
- "step": 740
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0364,
- "step": 741
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8845,
- "step": 742
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.887,
- "step": 743
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0706,
- "step": 744
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6619,
- "step": 745
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2941,
- "step": 746
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9192,
- "step": 747
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9947,
- "step": 748
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6376,
- "step": 749
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0358,
- "step": 750
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4578,
- "step": 751
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7784,
- "step": 752
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.632,
- "step": 753
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.8649,
- "step": 754
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7951,
- "step": 755
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3841,
- "step": 756
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4558,
- "step": 757
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7638,
- "step": 758
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9413,
- "step": 759
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0916,
- "step": 760
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1351,
- "step": 761
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6078,
- "step": 762
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7982,
- "step": 763
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6132,
- "step": 764
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.551,
- "step": 765
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3301,
- "step": 766
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4888,
- "step": 767
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1476,
- "step": 768
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4244,
- "step": 769
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.6025,
- "step": 770
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.102,
- "step": 771
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.017,
- "step": 772
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4101,
- "step": 773
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1741,
- "step": 774
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1256,
- "step": 775
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5164,
- "step": 776
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.6959,
- "step": 777
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7666,
- "step": 778
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4336,
- "step": 779
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 11.8478,
- "step": 780
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8382,
- "step": 781
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1792,
- "step": 782
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4424,
- "step": 783
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.345,
- "step": 784
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6887,
- "step": 785
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.9867,
- "step": 786
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6152,
- "step": 787
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7283,
- "step": 788
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0157,
- "step": 789
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6044,
- "step": 790
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4132,
- "step": 791
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.735,
- "step": 792
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3631,
- "step": 793
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2308,
- "step": 794
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2184,
- "step": 795
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.4661,
- "step": 796
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9637,
- "step": 797
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4178,
- "step": 798
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.5909,
- "step": 799
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.1482,
- "step": 800
- },
- {
- "epoch": 0.01,
- "eval_loss": 7.355834484100342,
- "eval_runtime": 22.6252,
- "eval_samples_per_second": 2.21,
- "eval_steps_per_second": 1.105,
- "step": 800
- },
- {
- "epoch": 0.01,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 5.191131496429444,
- "step": 800
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.0427,
- "step": 801
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.2669,
- "step": 802
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.8026,
- "step": 803
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4949,
- "step": 804
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4491,
- "step": 805
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0383,
- "step": 806
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.1213,
- "step": 807
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5158,
- "step": 808
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5648,
- "step": 809
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9932,
- "step": 810
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6441,
- "step": 811
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8661,
- "step": 812
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3609,
- "step": 813
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6828,
- "step": 814
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9693,
- "step": 815
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3733,
- "step": 816
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6286,
- "step": 817
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4349,
- "step": 818
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6706,
- "step": 819
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3089,
- "step": 820
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2394,
- "step": 821
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.963,
- "step": 822
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6564,
- "step": 823
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.997,
- "step": 824
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 1.9261,
- "step": 825
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1421,
- "step": 826
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2335,
- "step": 827
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3432,
- "step": 828
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0154,
- "step": 829
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5135,
- "step": 830
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6226,
- "step": 831
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1672,
- "step": 832
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0853,
- "step": 833
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1213,
- "step": 834
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7815,
- "step": 835
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8916,
- "step": 836
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6464,
- "step": 837
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3307,
- "step": 838
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8165,
- "step": 839
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.886,
- "step": 840
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4781,
- "step": 841
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.8351,
- "step": 842
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.358,
- "step": 843
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6501,
- "step": 844
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0864,
- "step": 845
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2922,
- "step": 846
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.9847,
- "step": 847
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.2558,
- "step": 848
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0195,
- "step": 849
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.996,
- "step": 850
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.5705,
- "step": 851
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4136,
- "step": 852
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.6302,
- "step": 853
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8761,
- "step": 854
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4995,
- "step": 855
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4762,
- "step": 856
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5749,
- "step": 857
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0273,
- "step": 858
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8258,
- "step": 859
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.1836,
- "step": 860
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5493,
- "step": 861
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1891,
- "step": 862
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.7392,
- "step": 863
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1655,
- "step": 864
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.5218,
- "step": 865
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3759,
- "step": 866
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2497,
- "step": 867
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5901,
- "step": 868
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0624,
- "step": 869
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.2452,
- "step": 870
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5649,
- "step": 871
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0826,
- "step": 872
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.2703,
- "step": 873
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9088,
- "step": 874
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3875,
- "step": 875
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2511,
- "step": 876
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4065,
- "step": 877
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.175,
- "step": 878
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8358,
- "step": 879
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3208,
- "step": 880
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2049,
- "step": 881
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8251,
- "step": 882
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4262,
- "step": 883
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2227,
- "step": 884
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1062,
- "step": 885
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9417,
- "step": 886
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3315,
- "step": 887
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0012,
- "step": 888
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6386,
- "step": 889
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0377,
- "step": 890
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6707,
- "step": 891
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4955,
- "step": 892
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7343,
- "step": 893
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8305,
- "step": 894
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.7016,
- "step": 895
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7149,
- "step": 896
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5649,
- "step": 897
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.815,
- "step": 898
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6135,
- "step": 899
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8776,
- "step": 900
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.7288,
- "step": 901
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8019,
- "step": 902
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0921,
- "step": 903
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.824,
- "step": 904
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7151,
- "step": 905
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.5054,
- "step": 906
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8095,
- "step": 907
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.3218,
- "step": 908
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9993,
- "step": 909
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4433,
- "step": 910
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5863,
- "step": 911
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.505,
- "step": 912
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9734,
- "step": 913
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1792,
- "step": 914
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4574,
- "step": 915
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2787,
- "step": 916
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8201,
- "step": 917
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2537,
- "step": 918
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1387,
- "step": 919
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7161,
- "step": 920
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2207,
- "step": 921
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7953,
- "step": 922
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9949,
- "step": 923
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9173,
- "step": 924
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7903,
- "step": 925
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4784,
- "step": 926
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2264,
- "step": 927
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.566,
- "step": 928
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0686,
- "step": 929
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.791,
- "step": 930
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8393,
- "step": 931
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4387,
- "step": 932
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2374,
- "step": 933
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.9598,
- "step": 934
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1597,
- "step": 935
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0403,
- "step": 936
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3301,
- "step": 937
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.072,
- "step": 938
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4788,
- "step": 939
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0656,
- "step": 940
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9647,
- "step": 941
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1168,
- "step": 942
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0293,
- "step": 943
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3622,
- "step": 944
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8957,
- "step": 945
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.4,
- "step": 946
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.6626,
- "step": 947
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8212,
- "step": 948
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8638,
- "step": 949
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6406,
- "step": 950
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7069,
- "step": 951
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1384,
- "step": 952
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.612,
- "step": 953
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7201,
- "step": 954
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3532,
- "step": 955
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1266,
- "step": 956
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6192,
- "step": 957
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.826,
- "step": 958
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9338,
- "step": 959
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.4487,
- "step": 960
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.872,
- "step": 961
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8601,
- "step": 962
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7401,
- "step": 963
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5412,
- "step": 964
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2501,
- "step": 965
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6837,
- "step": 966
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6494,
- "step": 967
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.604,
- "step": 968
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.837,
- "step": 969
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3957,
- "step": 970
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3281,
- "step": 971
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8264,
- "step": 972
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6554,
- "step": 973
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.5768,
- "step": 974
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4187,
- "step": 975
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8479,
- "step": 976
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9849,
- "step": 977
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6471,
- "step": 978
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8041,
- "step": 979
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8876,
- "step": 980
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6423,
- "step": 981
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5329,
- "step": 982
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2801,
- "step": 983
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1699,
- "step": 984
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6469,
- "step": 985
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6766,
- "step": 986
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7538,
- "step": 987
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9606,
- "step": 988
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0713,
- "step": 989
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4965,
- "step": 990
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3408,
- "step": 991
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4007,
- "step": 992
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.8921,
- "step": 993
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8681,
- "step": 994
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 1.8867,
- "step": 995
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.467,
- "step": 996
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7895,
- "step": 997
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0523,
- "step": 998
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4032,
- "step": 999
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7719,
- "step": 1000
- },
- {
- "epoch": 0.01,
- "eval_loss": 6.766034126281738,
- "eval_runtime": 22.4042,
- "eval_samples_per_second": 2.232,
- "eval_steps_per_second": 1.116,
- "step": 1000
- },
- {
- "epoch": 0.01,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 4.338861379623413,
- "step": 1000
- }
- ],
- "max_steps": 30000,
- "num_train_epochs": 1,
- "total_flos": 1.6844425298116608e+16,
- "trial_name": null,
- "trial_params": null
-}
diff --git a/checkpoint-1200/adapter_config.json b/checkpoint-1200/adapter_config.json
deleted file mode 100644
index a2f0ea437da66b2120cc72d92fb46f999dfb8535..0000000000000000000000000000000000000000
--- a/checkpoint-1200/adapter_config.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
- "auto_mapping": null,
- "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf",
- "bias": "none",
- "fan_in_fan_out": false,
- "inference_mode": true,
- "init_lora_weights": true,
- "layers_pattern": null,
- "layers_to_transform": null,
- "lora_alpha": 16.0,
- "lora_dropout": 0.1,
- "modules_to_save": null,
- "peft_type": "LORA",
- "r": 64,
- "revision": null,
- "target_modules": [
- "down_proj",
- "up_proj",
- "q_proj",
- "gate_proj",
- "o_proj",
- "v_proj",
- "k_proj"
- ],
- "task_type": "CAUSAL_LM"
-}
\ No newline at end of file
diff --git a/checkpoint-1200/adapter_model.bin b/checkpoint-1200/adapter_model.bin
deleted file mode 100644
index 75a7b4989a08354b3c5f3ec9c50abaef09e10d27..0000000000000000000000000000000000000000
--- a/checkpoint-1200/adapter_model.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d68320d48a3c9cc3d4b3c8179638bb384ec4f6d74abaa93c838c13006b9ba862
-size 871609293
diff --git a/checkpoint-1200/added_tokens.json b/checkpoint-1200/added_tokens.json
deleted file mode 100644
index e41416ddd79948246ea2dced6800ea3cd531c424..0000000000000000000000000000000000000000
--- a/checkpoint-1200/added_tokens.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
- "[PAD]": 32000
-}
diff --git a/checkpoint-1200/optimizer.pt b/checkpoint-1200/optimizer.pt
deleted file mode 100644
index a5c6fdfe93ff3165419108ea2e0d5f891ffe6712..0000000000000000000000000000000000000000
--- a/checkpoint-1200/optimizer.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3f9aac35de20d35eabe47b728aaf12aa82950ca0e3e387f6ae9d7994883ba471
-size 873873439
diff --git a/checkpoint-1200/rng_state.pth b/checkpoint-1200/rng_state.pth
deleted file mode 100644
index 0cfe32475808da660e30b67e453b0ffb407d1126..0000000000000000000000000000000000000000
--- a/checkpoint-1200/rng_state.pth
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:44cdcbac175f6aff4ad243d928c84233d1d837142541c4dbdbd0e9ab4a9edc74
-size 14511
diff --git a/checkpoint-1200/scheduler.pt b/checkpoint-1200/scheduler.pt
deleted file mode 100644
index d41281568533fa0ca511d25aaa4acec8de183814..0000000000000000000000000000000000000000
--- a/checkpoint-1200/scheduler.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9e303e329a219326eec4281908f48d8a4f3a6efdd9abb061adc0dc038af0e7e6
-size 627
diff --git a/checkpoint-1200/special_tokens_map.json b/checkpoint-1200/special_tokens_map.json
deleted file mode 100644
index 3f58a5e115855c6ea3cec98accae196ad927222e..0000000000000000000000000000000000000000
--- a/checkpoint-1200/special_tokens_map.json
+++ /dev/null
@@ -1,6 +0,0 @@
-{
- "bos_token": "",
- "eos_token": "",
- "pad_token": "[PAD]",
- "unk_token": ""
-}
diff --git a/checkpoint-1200/tokenizer.model b/checkpoint-1200/tokenizer.model
deleted file mode 100644
index 6c00c742ce03c627d6cd5b795984876fa49fa899..0000000000000000000000000000000000000000
--- a/checkpoint-1200/tokenizer.model
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
-size 499723
diff --git a/checkpoint-1200/tokenizer_config.json b/checkpoint-1200/tokenizer_config.json
deleted file mode 100644
index daaef2433dab9469de98b5b9a3848221ab25b7e8..0000000000000000000000000000000000000000
--- a/checkpoint-1200/tokenizer_config.json
+++ /dev/null
@@ -1,35 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "bos_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- },
- "clean_up_tokenization_spaces": false,
- "eos_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- },
- "legacy": null,
- "model_max_length": 1000000000000000019884624838656,
- "pad_token": null,
- "padding_side": "right",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizer",
- "unk_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/checkpoint-1200/trainer_state.json b/checkpoint-1200/trainer_state.json
deleted file mode 100644
index 31c52f21048e6a5283a8965065d924b5b9072b16..0000000000000000000000000000000000000000
--- a/checkpoint-1200/trainer_state.json
+++ /dev/null
@@ -1,7324 +0,0 @@
-{
- "best_metric": 6.766034126281738,
- "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-1000",
- "epoch": 0.009166603009701322,
- "global_step": 1200,
- "is_hyper_param_search": false,
- "is_local_process_zero": true,
- "is_world_process_zero": true,
- "log_history": [
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0808,
- "step": 1
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8773,
- "step": 2
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1965,
- "step": 3
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.118,
- "step": 4
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1773,
- "step": 5
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1165,
- "step": 6
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2666,
- "step": 7
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3704,
- "step": 8
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9976,
- "step": 9
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.985,
- "step": 10
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.0541,
- "step": 11
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.6228,
- "step": 12
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.3651,
- "step": 13
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0867,
- "step": 14
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.4422,
- "step": 15
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.7759,
- "step": 16
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1446,
- "step": 17
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0007,
- "step": 18
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0894,
- "step": 19
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2424,
- "step": 20
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.1343,
- "step": 21
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5354,
- "step": 22
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1887,
- "step": 23
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.6652,
- "step": 24
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.964,
- "step": 25
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1872,
- "step": 26
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.4722,
- "step": 27
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1462,
- "step": 28
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0485,
- "step": 29
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.148,
- "step": 30
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7274,
- "step": 31
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.6689,
- "step": 32
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3384,
- "step": 33
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.5354,
- "step": 34
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.1976,
- "step": 35
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.8593,
- "step": 36
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.9302,
- "step": 37
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5968,
- "step": 38
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3169,
- "step": 39
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.1793,
- "step": 40
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.8457,
- "step": 41
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5177,
- "step": 42
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.003,
- "step": 43
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.9928,
- "step": 44
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.2574,
- "step": 45
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3915,
- "step": 46
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4105,
- "step": 47
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.1184,
- "step": 48
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.72,
- "step": 49
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9628,
- "step": 50
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2372,
- "step": 51
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3733,
- "step": 52
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8936,
- "step": 53
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5353,
- "step": 54
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.0754,
- "step": 55
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6685,
- "step": 56
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8984,
- "step": 57
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2265,
- "step": 58
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7696,
- "step": 59
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7349,
- "step": 60
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0221,
- "step": 61
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.1901,
- "step": 62
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.387,
- "step": 63
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7323,
- "step": 64
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2077,
- "step": 65
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.3155,
- "step": 66
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1656,
- "step": 67
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 13.0828,
- "step": 68
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5295,
- "step": 69
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4575,
- "step": 70
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.7654,
- "step": 71
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.6263,
- "step": 72
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 24.8238,
- "step": 73
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.0654,
- "step": 74
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 28.1046,
- "step": 75
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.3232,
- "step": 76
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 22.9712,
- "step": 77
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 18.8529,
- "step": 78
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.8356,
- "step": 79
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 16.472,
- "step": 80
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.2369,
- "step": 81
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.0731,
- "step": 82
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.8853,
- "step": 83
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5438,
- "step": 84
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2665,
- "step": 85
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5484,
- "step": 86
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7546,
- "step": 87
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4309,
- "step": 88
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5593,
- "step": 89
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3822,
- "step": 90
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.6315,
- "step": 91
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6116,
- "step": 92
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2288,
- "step": 93
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0053,
- "step": 94
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.359,
- "step": 95
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9235,
- "step": 96
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 31.9845,
- "step": 97
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.1385,
- "step": 98
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6161,
- "step": 99
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8096,
- "step": 100
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9918,
- "step": 101
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.344,
- "step": 102
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1607,
- "step": 103
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4834,
- "step": 104
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.704,
- "step": 105
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1238,
- "step": 106
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8066,
- "step": 107
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9656,
- "step": 108
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1979,
- "step": 109
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2294,
- "step": 110
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.066,
- "step": 111
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7914,
- "step": 112
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7344,
- "step": 113
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6703,
- "step": 114
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8817,
- "step": 115
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.7733,
- "step": 116
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.469,
- "step": 117
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1304,
- "step": 118
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.871,
- "step": 119
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5353,
- "step": 120
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9055,
- "step": 121
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6142,
- "step": 122
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0201,
- "step": 123
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3805,
- "step": 124
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6825,
- "step": 125
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7166,
- "step": 126
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.7747,
- "step": 127
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7695,
- "step": 128
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7291,
- "step": 129
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1296,
- "step": 130
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5374,
- "step": 131
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.1854,
- "step": 132
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.434,
- "step": 133
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.438,
- "step": 134
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3027,
- "step": 135
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.382,
- "step": 136
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9277,
- "step": 137
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.223,
- "step": 138
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3042,
- "step": 139
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6361,
- "step": 140
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3547,
- "step": 141
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.7181,
- "step": 142
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.7528,
- "step": 143
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.4316,
- "step": 144
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2219,
- "step": 145
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7788,
- "step": 146
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2749,
- "step": 147
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2397,
- "step": 148
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6243,
- "step": 149
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.145,
- "step": 150
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7951,
- "step": 151
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1862,
- "step": 152
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1305,
- "step": 153
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5766,
- "step": 154
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9232,
- "step": 155
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9936,
- "step": 156
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.9692,
- "step": 157
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2772,
- "step": 158
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.302,
- "step": 159
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9931,
- "step": 160
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9675,
- "step": 161
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8536,
- "step": 162
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6589,
- "step": 163
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.932,
- "step": 164
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0301,
- "step": 165
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4861,
- "step": 166
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1354,
- "step": 167
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0717,
- "step": 168
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9346,
- "step": 169
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9373,
- "step": 170
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8777,
- "step": 171
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4193,
- "step": 172
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6831,
- "step": 173
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4175,
- "step": 174
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3629,
- "step": 175
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.118,
- "step": 176
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.633,
- "step": 177
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8355,
- "step": 178
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4522,
- "step": 179
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9272,
- "step": 180
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4631,
- "step": 181
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2987,
- "step": 182
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1183,
- "step": 183
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.9976,
- "step": 184
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0668,
- "step": 185
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6291,
- "step": 186
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5937,
- "step": 187
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7382,
- "step": 188
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7677,
- "step": 189
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.0293,
- "step": 190
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.6407,
- "step": 191
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9508,
- "step": 192
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.5053,
- "step": 193
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.5718,
- "step": 194
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5211,
- "step": 195
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9557,
- "step": 196
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1609,
- "step": 197
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8505,
- "step": 198
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8278,
- "step": 199
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8447,
- "step": 200
- },
- {
- "epoch": 0.0,
- "eval_loss": 7.883856773376465,
- "eval_runtime": 22.4254,
- "eval_samples_per_second": 2.23,
- "eval_steps_per_second": 1.115,
- "step": 200
- },
- {
- "epoch": 0.0,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 4.629522514343262,
- "step": 200
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3249,
- "step": 201
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.352,
- "step": 202
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2984,
- "step": 203
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.2734,
- "step": 204
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1,
- "step": 205
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.448,
- "step": 206
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2387,
- "step": 207
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.861,
- "step": 208
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.603,
- "step": 209
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.29,
- "step": 210
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2105,
- "step": 211
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.1949,
- "step": 212
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0538,
- "step": 213
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0343,
- "step": 214
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7794,
- "step": 215
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5532,
- "step": 216
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2676,
- "step": 217
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.566,
- "step": 218
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0432,
- "step": 219
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9391,
- "step": 220
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.724,
- "step": 221
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.229,
- "step": 222
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3462,
- "step": 223
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0752,
- "step": 224
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.1966,
- "step": 225
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7279,
- "step": 226
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8484,
- "step": 227
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7291,
- "step": 228
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2665,
- "step": 229
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3551,
- "step": 230
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7338,
- "step": 231
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8407,
- "step": 232
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3581,
- "step": 233
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.441,
- "step": 234
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0788,
- "step": 235
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8404,
- "step": 236
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4314,
- "step": 237
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8426,
- "step": 238
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.0205,
- "step": 239
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4162,
- "step": 240
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7515,
- "step": 241
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1442,
- "step": 242
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5868,
- "step": 243
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6514,
- "step": 244
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2683,
- "step": 245
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.31,
- "step": 246
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0161,
- "step": 247
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.484,
- "step": 248
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9726,
- "step": 249
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0926,
- "step": 250
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5279,
- "step": 251
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0017,
- "step": 252
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5684,
- "step": 253
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3875,
- "step": 254
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9489,
- "step": 255
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.8948,
- "step": 256
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0856,
- "step": 257
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.599,
- "step": 258
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1575,
- "step": 259
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3701,
- "step": 260
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.464,
- "step": 261
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9193,
- "step": 262
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5679,
- "step": 263
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9424,
- "step": 264
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6689,
- "step": 265
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6475,
- "step": 266
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4311,
- "step": 267
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7426,
- "step": 268
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5191,
- "step": 269
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3059,
- "step": 270
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0142,
- "step": 271
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.4509,
- "step": 272
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0831,
- "step": 273
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6977,
- "step": 274
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4236,
- "step": 275
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2129,
- "step": 276
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1394,
- "step": 277
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.685,
- "step": 278
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0275,
- "step": 279
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.3215,
- "step": 280
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6542,
- "step": 281
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7614,
- "step": 282
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2996,
- "step": 283
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6275,
- "step": 284
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8736,
- "step": 285
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4667,
- "step": 286
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8486,
- "step": 287
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2125,
- "step": 288
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4523,
- "step": 289
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.551,
- "step": 290
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.7158,
- "step": 291
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5092,
- "step": 292
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9169,
- "step": 293
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5333,
- "step": 294
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9949,
- "step": 295
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.7189,
- "step": 296
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2366,
- "step": 297
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4745,
- "step": 298
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2439,
- "step": 299
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4176,
- "step": 300
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.9365,
- "step": 301
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5309,
- "step": 302
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2201,
- "step": 303
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0312,
- "step": 304
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4173,
- "step": 305
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4856,
- "step": 306
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5041,
- "step": 307
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3597,
- "step": 308
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8395,
- "step": 309
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0776,
- "step": 310
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7566,
- "step": 311
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9767,
- "step": 312
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3804,
- "step": 313
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5327,
- "step": 314
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5293,
- "step": 315
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4531,
- "step": 316
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3961,
- "step": 317
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5669,
- "step": 318
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8559,
- "step": 319
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.117,
- "step": 320
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4279,
- "step": 321
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7977,
- "step": 322
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.955,
- "step": 323
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0164,
- "step": 324
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.0495,
- "step": 325
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2768,
- "step": 326
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3162,
- "step": 327
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.88,
- "step": 328
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2157,
- "step": 329
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8427,
- "step": 330
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9729,
- "step": 331
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1779,
- "step": 332
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1302,
- "step": 333
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7705,
- "step": 334
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.523,
- "step": 335
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9375,
- "step": 336
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.1409,
- "step": 337
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.633,
- "step": 338
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6481,
- "step": 339
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.933,
- "step": 340
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9179,
- "step": 341
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9332,
- "step": 342
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6553,
- "step": 343
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7412,
- "step": 344
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.849,
- "step": 345
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.7321,
- "step": 346
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9717,
- "step": 347
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3465,
- "step": 348
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4535,
- "step": 349
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2376,
- "step": 350
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9025,
- "step": 351
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.916,
- "step": 352
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.3785,
- "step": 353
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0576,
- "step": 354
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5081,
- "step": 355
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1303,
- "step": 356
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3854,
- "step": 357
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5553,
- "step": 358
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9627,
- "step": 359
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.402,
- "step": 360
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3484,
- "step": 361
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5428,
- "step": 362
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9128,
- "step": 363
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3934,
- "step": 364
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4812,
- "step": 365
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5395,
- "step": 366
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6304,
- "step": 367
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5626,
- "step": 368
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5693,
- "step": 369
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3458,
- "step": 370
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6254,
- "step": 371
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8706,
- "step": 372
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6076,
- "step": 373
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.2912,
- "step": 374
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3326,
- "step": 375
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3735,
- "step": 376
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4916,
- "step": 377
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5553,
- "step": 378
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6241,
- "step": 379
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6106,
- "step": 380
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.266,
- "step": 381
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7738,
- "step": 382
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4988,
- "step": 383
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2968,
- "step": 384
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8512,
- "step": 385
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0341,
- "step": 386
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.898,
- "step": 387
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.23,
- "step": 388
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9608,
- "step": 389
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.3679,
- "step": 390
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.7074,
- "step": 391
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9903,
- "step": 392
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5845,
- "step": 393
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6493,
- "step": 394
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7962,
- "step": 395
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4865,
- "step": 396
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3418,
- "step": 397
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3942,
- "step": 398
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4715,
- "step": 399
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2073,
- "step": 400
- },
- {
- "epoch": 0.0,
- "eval_loss": 7.106412410736084,
- "eval_runtime": 22.5667,
- "eval_samples_per_second": 2.216,
- "eval_steps_per_second": 1.108,
- "step": 400
- },
- {
- "epoch": 0.0,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 2.9128687667846678,
- "step": 400
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3984,
- "step": 401
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7983,
- "step": 402
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8589,
- "step": 403
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9884,
- "step": 404
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4427,
- "step": 405
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0374,
- "step": 406
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7999,
- "step": 407
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2437,
- "step": 408
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6902,
- "step": 409
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.81,
- "step": 410
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8979,
- "step": 411
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0211,
- "step": 412
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3945,
- "step": 413
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5807,
- "step": 414
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1433,
- "step": 415
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9466,
- "step": 416
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6276,
- "step": 417
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4945,
- "step": 418
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.6215,
- "step": 419
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.3919,
- "step": 420
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7915,
- "step": 421
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3284,
- "step": 422
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8723,
- "step": 423
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0149,
- "step": 424
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.979,
- "step": 425
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9175,
- "step": 426
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4994,
- "step": 427
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9791,
- "step": 428
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1156,
- "step": 429
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5813,
- "step": 430
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1882,
- "step": 431
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9956,
- "step": 432
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6189,
- "step": 433
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9624,
- "step": 434
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5387,
- "step": 435
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4605,
- "step": 436
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.474,
- "step": 437
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0497,
- "step": 438
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5705,
- "step": 439
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.275,
- "step": 440
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9638,
- "step": 441
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4857,
- "step": 442
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3067,
- "step": 443
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8152,
- "step": 444
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1668,
- "step": 445
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5293,
- "step": 446
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3981,
- "step": 447
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4787,
- "step": 448
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5981,
- "step": 449
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3569,
- "step": 450
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4088,
- "step": 451
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3677,
- "step": 452
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.4686,
- "step": 453
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3552,
- "step": 454
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7931,
- "step": 455
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9285,
- "step": 456
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0554,
- "step": 457
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7277,
- "step": 458
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2474,
- "step": 459
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9274,
- "step": 460
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2558,
- "step": 461
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.7547,
- "step": 462
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1264,
- "step": 463
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2124,
- "step": 464
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8751,
- "step": 465
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7317,
- "step": 466
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3697,
- "step": 467
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0021,
- "step": 468
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3761,
- "step": 469
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2291,
- "step": 470
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7968,
- "step": 471
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9454,
- "step": 472
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0194,
- "step": 473
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5048,
- "step": 474
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6837,
- "step": 475
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1066,
- "step": 476
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3501,
- "step": 477
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5071,
- "step": 478
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1086,
- "step": 479
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7269,
- "step": 480
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5419,
- "step": 481
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2974,
- "step": 482
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.1433,
- "step": 483
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0869,
- "step": 484
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.032,
- "step": 485
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0946,
- "step": 486
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7162,
- "step": 487
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0406,
- "step": 488
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.9048,
- "step": 489
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2231,
- "step": 490
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.6524,
- "step": 491
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1151,
- "step": 492
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.591,
- "step": 493
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1628,
- "step": 494
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0757,
- "step": 495
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3471,
- "step": 496
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9385,
- "step": 497
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9362,
- "step": 498
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2252,
- "step": 499
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.359,
- "step": 500
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0497,
- "step": 501
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0484,
- "step": 502
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5773,
- "step": 503
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.39,
- "step": 504
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5923,
- "step": 505
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2,
- "step": 506
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5536,
- "step": 507
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.8958,
- "step": 508
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7763,
- "step": 509
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2045,
- "step": 510
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.4219,
- "step": 511
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6305,
- "step": 512
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4243,
- "step": 513
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7842,
- "step": 514
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8769,
- "step": 515
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8903,
- "step": 516
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0489,
- "step": 517
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1314,
- "step": 518
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5973,
- "step": 519
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8022,
- "step": 520
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3539,
- "step": 521
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.222,
- "step": 522
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5403,
- "step": 523
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1323,
- "step": 524
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7813,
- "step": 525
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4982,
- "step": 526
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2426,
- "step": 527
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0142,
- "step": 528
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8996,
- "step": 529
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8671,
- "step": 530
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4139,
- "step": 531
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9478,
- "step": 532
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7062,
- "step": 533
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.0098,
- "step": 534
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9195,
- "step": 535
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0255,
- "step": 536
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6291,
- "step": 537
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3245,
- "step": 538
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6382,
- "step": 539
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.8076,
- "step": 540
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6725,
- "step": 541
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0563,
- "step": 542
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.6178,
- "step": 543
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7974,
- "step": 544
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.7535,
- "step": 545
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4948,
- "step": 546
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8941,
- "step": 547
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.6496,
- "step": 548
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.9084,
- "step": 549
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.65,
- "step": 550
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7673,
- "step": 551
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2221,
- "step": 552
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.14,
- "step": 553
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.6747,
- "step": 554
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8009,
- "step": 555
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7307,
- "step": 556
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0143,
- "step": 557
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8098,
- "step": 558
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.026,
- "step": 559
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4572,
- "step": 560
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7913,
- "step": 561
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9962,
- "step": 562
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.767,
- "step": 563
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9497,
- "step": 564
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9626,
- "step": 565
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2536,
- "step": 566
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0421,
- "step": 567
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8177,
- "step": 568
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9241,
- "step": 569
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0162,
- "step": 570
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3368,
- "step": 571
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7515,
- "step": 572
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6389,
- "step": 573
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.662,
- "step": 574
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8097,
- "step": 575
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9346,
- "step": 576
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3154,
- "step": 577
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7724,
- "step": 578
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3685,
- "step": 579
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.2775,
- "step": 580
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.106,
- "step": 581
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4733,
- "step": 582
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2334,
- "step": 583
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9478,
- "step": 584
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0013,
- "step": 585
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7242,
- "step": 586
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.922,
- "step": 587
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1418,
- "step": 588
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4472,
- "step": 589
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4785,
- "step": 590
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.783,
- "step": 591
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0706,
- "step": 592
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4136,
- "step": 593
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5969,
- "step": 594
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5157,
- "step": 595
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5658,
- "step": 596
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4647,
- "step": 597
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2028,
- "step": 598
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6913,
- "step": 599
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7239,
- "step": 600
- },
- {
- "epoch": 0.0,
- "eval_loss": 7.012163162231445,
- "eval_runtime": 22.5807,
- "eval_samples_per_second": 2.214,
- "eval_steps_per_second": 1.107,
- "step": 600
- },
- {
- "epoch": 0.0,
- "mmlu_eval_accuracy": 0.3260281385281385,
- "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
- "mmlu_eval_accuracy_anatomy": 0.35714285714285715,
- "mmlu_eval_accuracy_astronomy": 0.25,
- "mmlu_eval_accuracy_business_ethics": 0.3333333333333333,
- "mmlu_loss": 4.24488224029541,
- "step": 600
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5253,
- "step": 601
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0392,
- "step": 602
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.447,
- "step": 603
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9441,
- "step": 604
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1874,
- "step": 605
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7817,
- "step": 606
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0348,
- "step": 607
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5593,
- "step": 608
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9361,
- "step": 609
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3534,
- "step": 610
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.476,
- "step": 611
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0937,
- "step": 612
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3027,
- "step": 613
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5586,
- "step": 614
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3796,
- "step": 615
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.676,
- "step": 616
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.5321,
- "step": 617
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0059,
- "step": 618
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6139,
- "step": 619
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.2391,
- "step": 620
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.0636,
- "step": 621
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0895,
- "step": 622
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.62,
- "step": 623
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0469,
- "step": 624
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2173,
- "step": 625
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9432,
- "step": 626
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3928,
- "step": 627
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0959,
- "step": 628
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.1197,
- "step": 629
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4277,
- "step": 630
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.418,
- "step": 631
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8687,
- "step": 632
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0156,
- "step": 633
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.573,
- "step": 634
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.112,
- "step": 635
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8954,
- "step": 636
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.36,
- "step": 637
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.924,
- "step": 638
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.4625,
- "step": 639
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2023,
- "step": 640
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0685,
- "step": 641
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.5304,
- "step": 642
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4456,
- "step": 643
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7271,
- "step": 644
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6011,
- "step": 645
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.895,
- "step": 646
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.864,
- "step": 647
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3452,
- "step": 648
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8978,
- "step": 649
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2253,
- "step": 650
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2813,
- "step": 651
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7248,
- "step": 652
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4283,
- "step": 653
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4304,
- "step": 654
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3893,
- "step": 655
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1115,
- "step": 656
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5892,
- "step": 657
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6572,
- "step": 658
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.925,
- "step": 659
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4431,
- "step": 660
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7711,
- "step": 661
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9439,
- "step": 662
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3781,
- "step": 663
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5573,
- "step": 664
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.4476,
- "step": 665
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0057,
- "step": 666
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2702,
- "step": 667
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5717,
- "step": 668
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2242,
- "step": 669
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1,
- "step": 670
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0517,
- "step": 671
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6543,
- "step": 672
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1138,
- "step": 673
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.461,
- "step": 674
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.7094,
- "step": 675
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.521,
- "step": 676
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7116,
- "step": 677
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6343,
- "step": 678
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3762,
- "step": 679
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3603,
- "step": 680
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7144,
- "step": 681
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4545,
- "step": 682
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8188,
- "step": 683
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7965,
- "step": 684
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.4675,
- "step": 685
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0436,
- "step": 686
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1219,
- "step": 687
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4517,
- "step": 688
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8476,
- "step": 689
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 1.9284,
- "step": 690
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7405,
- "step": 691
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7142,
- "step": 692
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3979,
- "step": 693
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 1.3285,
- "step": 694
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3418,
- "step": 695
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4472,
- "step": 696
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7355,
- "step": 697
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7982,
- "step": 698
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.4516,
- "step": 699
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.2532,
- "step": 700
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.9959,
- "step": 701
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0418,
- "step": 702
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.7767,
- "step": 703
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.774,
- "step": 704
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8912,
- "step": 705
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2178,
- "step": 706
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.6197,
- "step": 707
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4755,
- "step": 708
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8276,
- "step": 709
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.2925,
- "step": 710
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.3887,
- "step": 711
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1465,
- "step": 712
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.5806,
- "step": 713
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3063,
- "step": 714
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6066,
- "step": 715
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1536,
- "step": 716
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5582,
- "step": 717
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0353,
- "step": 718
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6415,
- "step": 719
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8291,
- "step": 720
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.7575,
- "step": 721
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9141,
- "step": 722
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5217,
- "step": 723
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4549,
- "step": 724
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8112,
- "step": 725
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2729,
- "step": 726
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8515,
- "step": 727
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9712,
- "step": 728
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.097,
- "step": 729
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0208,
- "step": 730
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1377,
- "step": 731
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4019,
- "step": 732
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.9869,
- "step": 733
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2954,
- "step": 734
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4144,
- "step": 735
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8053,
- "step": 736
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8891,
- "step": 737
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.812,
- "step": 738
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2657,
- "step": 739
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3747,
- "step": 740
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0364,
- "step": 741
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8845,
- "step": 742
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.887,
- "step": 743
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0706,
- "step": 744
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6619,
- "step": 745
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2941,
- "step": 746
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9192,
- "step": 747
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9947,
- "step": 748
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6376,
- "step": 749
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0358,
- "step": 750
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4578,
- "step": 751
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7784,
- "step": 752
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.632,
- "step": 753
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.8649,
- "step": 754
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7951,
- "step": 755
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3841,
- "step": 756
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4558,
- "step": 757
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7638,
- "step": 758
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9413,
- "step": 759
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0916,
- "step": 760
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1351,
- "step": 761
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6078,
- "step": 762
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7982,
- "step": 763
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6132,
- "step": 764
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.551,
- "step": 765
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3301,
- "step": 766
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4888,
- "step": 767
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1476,
- "step": 768
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4244,
- "step": 769
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.6025,
- "step": 770
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.102,
- "step": 771
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.017,
- "step": 772
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4101,
- "step": 773
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1741,
- "step": 774
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1256,
- "step": 775
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5164,
- "step": 776
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.6959,
- "step": 777
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7666,
- "step": 778
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4336,
- "step": 779
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 11.8478,
- "step": 780
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8382,
- "step": 781
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1792,
- "step": 782
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4424,
- "step": 783
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.345,
- "step": 784
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6887,
- "step": 785
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.9867,
- "step": 786
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6152,
- "step": 787
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7283,
- "step": 788
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0157,
- "step": 789
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6044,
- "step": 790
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4132,
- "step": 791
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.735,
- "step": 792
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3631,
- "step": 793
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2308,
- "step": 794
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2184,
- "step": 795
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.4661,
- "step": 796
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9637,
- "step": 797
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4178,
- "step": 798
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.5909,
- "step": 799
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.1482,
- "step": 800
- },
- {
- "epoch": 0.01,
- "eval_loss": 7.355834484100342,
- "eval_runtime": 22.6252,
- "eval_samples_per_second": 2.21,
- "eval_steps_per_second": 1.105,
- "step": 800
- },
- {
- "epoch": 0.01,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 5.191131496429444,
- "step": 800
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.0427,
- "step": 801
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.2669,
- "step": 802
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.8026,
- "step": 803
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4949,
- "step": 804
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4491,
- "step": 805
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0383,
- "step": 806
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.1213,
- "step": 807
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5158,
- "step": 808
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5648,
- "step": 809
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9932,
- "step": 810
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6441,
- "step": 811
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8661,
- "step": 812
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3609,
- "step": 813
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6828,
- "step": 814
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9693,
- "step": 815
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3733,
- "step": 816
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6286,
- "step": 817
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4349,
- "step": 818
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6706,
- "step": 819
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3089,
- "step": 820
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2394,
- "step": 821
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.963,
- "step": 822
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6564,
- "step": 823
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.997,
- "step": 824
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 1.9261,
- "step": 825
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1421,
- "step": 826
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2335,
- "step": 827
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3432,
- "step": 828
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0154,
- "step": 829
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5135,
- "step": 830
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6226,
- "step": 831
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1672,
- "step": 832
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0853,
- "step": 833
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1213,
- "step": 834
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7815,
- "step": 835
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8916,
- "step": 836
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6464,
- "step": 837
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3307,
- "step": 838
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8165,
- "step": 839
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.886,
- "step": 840
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4781,
- "step": 841
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.8351,
- "step": 842
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.358,
- "step": 843
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6501,
- "step": 844
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0864,
- "step": 845
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2922,
- "step": 846
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.9847,
- "step": 847
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.2558,
- "step": 848
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0195,
- "step": 849
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.996,
- "step": 850
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.5705,
- "step": 851
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4136,
- "step": 852
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.6302,
- "step": 853
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8761,
- "step": 854
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4995,
- "step": 855
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4762,
- "step": 856
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5749,
- "step": 857
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0273,
- "step": 858
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8258,
- "step": 859
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.1836,
- "step": 860
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5493,
- "step": 861
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1891,
- "step": 862
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.7392,
- "step": 863
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1655,
- "step": 864
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.5218,
- "step": 865
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3759,
- "step": 866
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2497,
- "step": 867
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5901,
- "step": 868
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0624,
- "step": 869
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.2452,
- "step": 870
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5649,
- "step": 871
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0826,
- "step": 872
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.2703,
- "step": 873
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9088,
- "step": 874
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3875,
- "step": 875
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2511,
- "step": 876
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4065,
- "step": 877
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.175,
- "step": 878
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8358,
- "step": 879
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3208,
- "step": 880
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2049,
- "step": 881
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8251,
- "step": 882
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4262,
- "step": 883
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2227,
- "step": 884
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1062,
- "step": 885
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9417,
- "step": 886
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3315,
- "step": 887
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0012,
- "step": 888
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6386,
- "step": 889
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0377,
- "step": 890
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6707,
- "step": 891
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4955,
- "step": 892
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7343,
- "step": 893
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8305,
- "step": 894
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.7016,
- "step": 895
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7149,
- "step": 896
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5649,
- "step": 897
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.815,
- "step": 898
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6135,
- "step": 899
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8776,
- "step": 900
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.7288,
- "step": 901
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8019,
- "step": 902
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0921,
- "step": 903
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.824,
- "step": 904
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7151,
- "step": 905
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.5054,
- "step": 906
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8095,
- "step": 907
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.3218,
- "step": 908
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9993,
- "step": 909
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4433,
- "step": 910
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5863,
- "step": 911
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.505,
- "step": 912
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9734,
- "step": 913
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1792,
- "step": 914
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4574,
- "step": 915
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2787,
- "step": 916
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8201,
- "step": 917
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2537,
- "step": 918
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1387,
- "step": 919
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7161,
- "step": 920
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2207,
- "step": 921
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7953,
- "step": 922
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9949,
- "step": 923
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9173,
- "step": 924
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7903,
- "step": 925
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4784,
- "step": 926
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2264,
- "step": 927
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.566,
- "step": 928
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0686,
- "step": 929
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.791,
- "step": 930
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8393,
- "step": 931
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4387,
- "step": 932
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2374,
- "step": 933
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.9598,
- "step": 934
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1597,
- "step": 935
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0403,
- "step": 936
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3301,
- "step": 937
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.072,
- "step": 938
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4788,
- "step": 939
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0656,
- "step": 940
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9647,
- "step": 941
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1168,
- "step": 942
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0293,
- "step": 943
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3622,
- "step": 944
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8957,
- "step": 945
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.4,
- "step": 946
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.6626,
- "step": 947
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8212,
- "step": 948
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8638,
- "step": 949
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6406,
- "step": 950
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7069,
- "step": 951
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1384,
- "step": 952
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.612,
- "step": 953
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7201,
- "step": 954
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3532,
- "step": 955
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1266,
- "step": 956
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6192,
- "step": 957
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.826,
- "step": 958
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9338,
- "step": 959
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.4487,
- "step": 960
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.872,
- "step": 961
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8601,
- "step": 962
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7401,
- "step": 963
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5412,
- "step": 964
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2501,
- "step": 965
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6837,
- "step": 966
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6494,
- "step": 967
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.604,
- "step": 968
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.837,
- "step": 969
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3957,
- "step": 970
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3281,
- "step": 971
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8264,
- "step": 972
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6554,
- "step": 973
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.5768,
- "step": 974
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4187,
- "step": 975
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8479,
- "step": 976
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9849,
- "step": 977
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6471,
- "step": 978
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8041,
- "step": 979
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8876,
- "step": 980
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6423,
- "step": 981
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5329,
- "step": 982
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2801,
- "step": 983
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1699,
- "step": 984
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6469,
- "step": 985
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6766,
- "step": 986
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7538,
- "step": 987
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9606,
- "step": 988
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0713,
- "step": 989
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4965,
- "step": 990
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3408,
- "step": 991
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4007,
- "step": 992
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.8921,
- "step": 993
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8681,
- "step": 994
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 1.8867,
- "step": 995
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.467,
- "step": 996
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7895,
- "step": 997
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0523,
- "step": 998
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4032,
- "step": 999
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7719,
- "step": 1000
- },
- {
- "epoch": 0.01,
- "eval_loss": 6.766034126281738,
- "eval_runtime": 22.4042,
- "eval_samples_per_second": 2.232,
- "eval_steps_per_second": 1.116,
- "step": 1000
- },
- {
- "epoch": 0.01,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 4.338861379623413,
- "step": 1000
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0285,
- "step": 1001
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4571,
- "step": 1002
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7721,
- "step": 1003
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5108,
- "step": 1004
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3813,
- "step": 1005
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.7963,
- "step": 1006
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1101,
- "step": 1007
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.021,
- "step": 1008
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5916,
- "step": 1009
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8813,
- "step": 1010
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1193,
- "step": 1011
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5092,
- "step": 1012
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8569,
- "step": 1013
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.119,
- "step": 1014
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3247,
- "step": 1015
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.2358,
- "step": 1016
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2795,
- "step": 1017
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3466,
- "step": 1018
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5443,
- "step": 1019
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7296,
- "step": 1020
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0412,
- "step": 1021
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4829,
- "step": 1022
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7901,
- "step": 1023
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8077,
- "step": 1024
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4887,
- "step": 1025
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3095,
- "step": 1026
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3235,
- "step": 1027
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.6315,
- "step": 1028
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4294,
- "step": 1029
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8457,
- "step": 1030
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7583,
- "step": 1031
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3129,
- "step": 1032
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1832,
- "step": 1033
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.1764,
- "step": 1034
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0101,
- "step": 1035
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6524,
- "step": 1036
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2825,
- "step": 1037
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2262,
- "step": 1038
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2533,
- "step": 1039
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8794,
- "step": 1040
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7901,
- "step": 1041
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8351,
- "step": 1042
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5888,
- "step": 1043
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8932,
- "step": 1044
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2999,
- "step": 1045
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8396,
- "step": 1046
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.4209,
- "step": 1047
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.1524,
- "step": 1048
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7784,
- "step": 1049
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0179,
- "step": 1050
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1153,
- "step": 1051
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2149,
- "step": 1052
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0117,
- "step": 1053
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9693,
- "step": 1054
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5656,
- "step": 1055
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5,
- "step": 1056
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.102,
- "step": 1057
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3079,
- "step": 1058
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5754,
- "step": 1059
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6989,
- "step": 1060
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9597,
- "step": 1061
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3743,
- "step": 1062
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8887,
- "step": 1063
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.3779,
- "step": 1064
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5001,
- "step": 1065
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4095,
- "step": 1066
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5073,
- "step": 1067
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1331,
- "step": 1068
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.323,
- "step": 1069
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6116,
- "step": 1070
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.1212,
- "step": 1071
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0951,
- "step": 1072
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.2463,
- "step": 1073
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4488,
- "step": 1074
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.279,
- "step": 1075
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.5728,
- "step": 1076
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1362,
- "step": 1077
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6648,
- "step": 1078
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.427,
- "step": 1079
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8145,
- "step": 1080
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5308,
- "step": 1081
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.974,
- "step": 1082
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1965,
- "step": 1083
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8749,
- "step": 1084
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7352,
- "step": 1085
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.7934,
- "step": 1086
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6003,
- "step": 1087
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5775,
- "step": 1088
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.519,
- "step": 1089
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7403,
- "step": 1090
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8145,
- "step": 1091
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5776,
- "step": 1092
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3753,
- "step": 1093
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.9586,
- "step": 1094
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7263,
- "step": 1095
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.7034,
- "step": 1096
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0579,
- "step": 1097
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.8419,
- "step": 1098
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0751,
- "step": 1099
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6438,
- "step": 1100
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8744,
- "step": 1101
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4992,
- "step": 1102
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.8094,
- "step": 1103
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.162,
- "step": 1104
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.8351,
- "step": 1105
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8845,
- "step": 1106
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1894,
- "step": 1107
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.8333,
- "step": 1108
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4226,
- "step": 1109
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0712,
- "step": 1110
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.9981,
- "step": 1111
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.5885,
- "step": 1112
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.1915,
- "step": 1113
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8003,
- "step": 1114
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5566,
- "step": 1115
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4085,
- "step": 1116
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0793,
- "step": 1117
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0909,
- "step": 1118
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2273,
- "step": 1119
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8273,
- "step": 1120
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0231,
- "step": 1121
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7229,
- "step": 1122
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4479,
- "step": 1123
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2178,
- "step": 1124
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9038,
- "step": 1125
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2653,
- "step": 1126
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2974,
- "step": 1127
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3003,
- "step": 1128
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7853,
- "step": 1129
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.9143,
- "step": 1130
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2573,
- "step": 1131
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7091,
- "step": 1132
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3372,
- "step": 1133
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4165,
- "step": 1134
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4422,
- "step": 1135
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.7693,
- "step": 1136
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7802,
- "step": 1137
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7263,
- "step": 1138
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.6749,
- "step": 1139
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9459,
- "step": 1140
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9697,
- "step": 1141
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4506,
- "step": 1142
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5099,
- "step": 1143
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1475,
- "step": 1144
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3769,
- "step": 1145
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2035,
- "step": 1146
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6017,
- "step": 1147
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.463,
- "step": 1148
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3844,
- "step": 1149
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5306,
- "step": 1150
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5502,
- "step": 1151
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7394,
- "step": 1152
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5626,
- "step": 1153
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1618,
- "step": 1154
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5174,
- "step": 1155
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1038,
- "step": 1156
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3789,
- "step": 1157
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2985,
- "step": 1158
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4763,
- "step": 1159
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.5071,
- "step": 1160
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0827,
- "step": 1161
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7349,
- "step": 1162
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.798,
- "step": 1163
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3176,
- "step": 1164
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8114,
- "step": 1165
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3379,
- "step": 1166
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1157,
- "step": 1167
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.4675,
- "step": 1168
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2721,
- "step": 1169
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0603,
- "step": 1170
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6358,
- "step": 1171
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0865,
- "step": 1172
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.709,
- "step": 1173
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7705,
- "step": 1174
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7677,
- "step": 1175
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2418,
- "step": 1176
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.7114,
- "step": 1177
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1165,
- "step": 1178
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9654,
- "step": 1179
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0672,
- "step": 1180
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1738,
- "step": 1181
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7604,
- "step": 1182
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8426,
- "step": 1183
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0231,
- "step": 1184
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2938,
- "step": 1185
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.783,
- "step": 1186
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3328,
- "step": 1187
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.321,
- "step": 1188
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6368,
- "step": 1189
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.101,
- "step": 1190
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6777,
- "step": 1191
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0831,
- "step": 1192
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5853,
- "step": 1193
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7923,
- "step": 1194
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3734,
- "step": 1195
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4268,
- "step": 1196
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6796,
- "step": 1197
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.9028,
- "step": 1198
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3716,
- "step": 1199
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6761,
- "step": 1200
- },
- {
- "epoch": 0.01,
- "eval_loss": 6.9188361167907715,
- "eval_runtime": 22.426,
- "eval_samples_per_second": 2.23,
- "eval_steps_per_second": 1.115,
- "step": 1200
- },
- {
- "epoch": 0.01,
- "mmlu_eval_accuracy": 0.3260281385281385,
- "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
- "mmlu_eval_accuracy_anatomy": 0.35714285714285715,
- "mmlu_eval_accuracy_astronomy": 0.25,
- "mmlu_eval_accuracy_business_ethics": 0.3333333333333333,
- "mmlu_loss": 3.3686839294433595,
- "step": 1200
- }
- ],
- "max_steps": 30000,
- "num_train_epochs": 1,
- "total_flos": 2.046385511350272e+16,
- "trial_name": null,
- "trial_params": null
-}
diff --git a/checkpoint-1200/training_args.bin b/checkpoint-1200/training_args.bin
deleted file mode 100644
index 29a1b90871dc30211978426049e89f31e2b38f56..0000000000000000000000000000000000000000
--- a/checkpoint-1200/training_args.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2493c95326c359fb00f88976162bc7966690beaaca22964b91c1db649a04988f
-size 6011
diff --git a/checkpoint-1400/README.md b/checkpoint-1400/README.md
deleted file mode 100644
index 82793f73e61dbb024e11fc6697bba1622d4d0db6..0000000000000000000000000000000000000000
--- a/checkpoint-1400/README.md
+++ /dev/null
@@ -1,20 +0,0 @@
----
-library_name: peft
----
-## Training procedure
-
-
-The following `bitsandbytes` quantization config was used during training:
-- load_in_8bit: False
-- load_in_4bit: True
-- llm_int8_threshold: 6.0
-- llm_int8_skip_modules: None
-- llm_int8_enable_fp32_cpu_offload: False
-- llm_int8_has_fp16_weight: False
-- bnb_4bit_quant_type: nf4
-- bnb_4bit_use_double_quant: True
-- bnb_4bit_compute_dtype: bfloat16
-### Framework versions
-
-
-- PEFT 0.4.0
diff --git a/checkpoint-1400/adapter_config.json b/checkpoint-1400/adapter_config.json
deleted file mode 100644
index a2f0ea437da66b2120cc72d92fb46f999dfb8535..0000000000000000000000000000000000000000
--- a/checkpoint-1400/adapter_config.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
- "auto_mapping": null,
- "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf",
- "bias": "none",
- "fan_in_fan_out": false,
- "inference_mode": true,
- "init_lora_weights": true,
- "layers_pattern": null,
- "layers_to_transform": null,
- "lora_alpha": 16.0,
- "lora_dropout": 0.1,
- "modules_to_save": null,
- "peft_type": "LORA",
- "r": 64,
- "revision": null,
- "target_modules": [
- "down_proj",
- "up_proj",
- "q_proj",
- "gate_proj",
- "o_proj",
- "v_proj",
- "k_proj"
- ],
- "task_type": "CAUSAL_LM"
-}
\ No newline at end of file
diff --git a/checkpoint-1400/adapter_model.bin b/checkpoint-1400/adapter_model.bin
deleted file mode 100644
index ba3f0b56e75d88ed0a54d7e5d2e9b0dbb3953c67..0000000000000000000000000000000000000000
--- a/checkpoint-1400/adapter_model.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:80118c725477dea393f7c5d033e93b59658969b94a87d00f4bf43d4221785903
-size 871609293
diff --git a/checkpoint-1400/added_tokens.json b/checkpoint-1400/added_tokens.json
deleted file mode 100644
index e41416ddd79948246ea2dced6800ea3cd531c424..0000000000000000000000000000000000000000
--- a/checkpoint-1400/added_tokens.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
- "[PAD]": 32000
-}
diff --git a/checkpoint-1400/optimizer.pt b/checkpoint-1400/optimizer.pt
deleted file mode 100644
index 1d963272d65d802cca0e4d5cb7f32ee64edebdb8..0000000000000000000000000000000000000000
--- a/checkpoint-1400/optimizer.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a8ea3d806921abca9924806d7f6abd96f8dea29d950b57b47e1e75777d7fb3a8
-size 873873439
diff --git a/checkpoint-1400/rng_state.pth b/checkpoint-1400/rng_state.pth
deleted file mode 100644
index 7e428c53bf9959233f3d69d8fd6ced69eb82d4d9..0000000000000000000000000000000000000000
--- a/checkpoint-1400/rng_state.pth
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:15d355942b9dbfbf3de49497b3988f81d4457408fe99ee8460375e60e095e3c7
-size 14511
diff --git a/checkpoint-1400/scheduler.pt b/checkpoint-1400/scheduler.pt
deleted file mode 100644
index 1cd1dc46688c1bafdeee595dca33df6ed135d924..0000000000000000000000000000000000000000
--- a/checkpoint-1400/scheduler.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4f07b5980373715ebdb21e0ee5240bdc9b0757226ab8995fd5ad862d2e1e5e0b
-size 627
diff --git a/checkpoint-1400/special_tokens_map.json b/checkpoint-1400/special_tokens_map.json
deleted file mode 100644
index 3f58a5e115855c6ea3cec98accae196ad927222e..0000000000000000000000000000000000000000
--- a/checkpoint-1400/special_tokens_map.json
+++ /dev/null
@@ -1,6 +0,0 @@
-{
- "bos_token": "",
- "eos_token": "",
- "pad_token": "[PAD]",
- "unk_token": ""
-}
diff --git a/checkpoint-1400/tokenizer.model b/checkpoint-1400/tokenizer.model
deleted file mode 100644
index 6c00c742ce03c627d6cd5b795984876fa49fa899..0000000000000000000000000000000000000000
--- a/checkpoint-1400/tokenizer.model
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
-size 499723
diff --git a/checkpoint-1400/tokenizer_config.json b/checkpoint-1400/tokenizer_config.json
deleted file mode 100644
index daaef2433dab9469de98b5b9a3848221ab25b7e8..0000000000000000000000000000000000000000
--- a/checkpoint-1400/tokenizer_config.json
+++ /dev/null
@@ -1,35 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "bos_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- },
- "clean_up_tokenization_spaces": false,
- "eos_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- },
- "legacy": null,
- "model_max_length": 1000000000000000019884624838656,
- "pad_token": null,
- "padding_side": "right",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizer",
- "unk_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/checkpoint-1400/trainer_state.json b/checkpoint-1400/trainer_state.json
deleted file mode 100644
index 2d6dd4148df19d0b3e192a7960494fb00c8806d3..0000000000000000000000000000000000000000
--- a/checkpoint-1400/trainer_state.json
+++ /dev/null
@@ -1,8542 +0,0 @@
-{
- "best_metric": 6.766034126281738,
- "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-1000",
- "epoch": 0.010694370177984875,
- "global_step": 1400,
- "is_hyper_param_search": false,
- "is_local_process_zero": true,
- "is_world_process_zero": true,
- "log_history": [
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0808,
- "step": 1
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8773,
- "step": 2
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1965,
- "step": 3
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.118,
- "step": 4
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1773,
- "step": 5
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1165,
- "step": 6
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2666,
- "step": 7
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3704,
- "step": 8
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9976,
- "step": 9
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.985,
- "step": 10
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.0541,
- "step": 11
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.6228,
- "step": 12
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.3651,
- "step": 13
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0867,
- "step": 14
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.4422,
- "step": 15
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.7759,
- "step": 16
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1446,
- "step": 17
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0007,
- "step": 18
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0894,
- "step": 19
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2424,
- "step": 20
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.1343,
- "step": 21
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5354,
- "step": 22
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1887,
- "step": 23
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.6652,
- "step": 24
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.964,
- "step": 25
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1872,
- "step": 26
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.4722,
- "step": 27
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1462,
- "step": 28
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0485,
- "step": 29
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.148,
- "step": 30
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7274,
- "step": 31
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.6689,
- "step": 32
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3384,
- "step": 33
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.5354,
- "step": 34
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.1976,
- "step": 35
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.8593,
- "step": 36
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.9302,
- "step": 37
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5968,
- "step": 38
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3169,
- "step": 39
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.1793,
- "step": 40
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.8457,
- "step": 41
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5177,
- "step": 42
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.003,
- "step": 43
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.9928,
- "step": 44
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.2574,
- "step": 45
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3915,
- "step": 46
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4105,
- "step": 47
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.1184,
- "step": 48
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.72,
- "step": 49
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9628,
- "step": 50
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2372,
- "step": 51
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3733,
- "step": 52
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8936,
- "step": 53
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5353,
- "step": 54
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.0754,
- "step": 55
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6685,
- "step": 56
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8984,
- "step": 57
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2265,
- "step": 58
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7696,
- "step": 59
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7349,
- "step": 60
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0221,
- "step": 61
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.1901,
- "step": 62
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.387,
- "step": 63
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7323,
- "step": 64
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2077,
- "step": 65
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.3155,
- "step": 66
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1656,
- "step": 67
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 13.0828,
- "step": 68
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5295,
- "step": 69
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4575,
- "step": 70
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.7654,
- "step": 71
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.6263,
- "step": 72
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 24.8238,
- "step": 73
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.0654,
- "step": 74
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 28.1046,
- "step": 75
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.3232,
- "step": 76
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 22.9712,
- "step": 77
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 18.8529,
- "step": 78
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.8356,
- "step": 79
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 16.472,
- "step": 80
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.2369,
- "step": 81
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.0731,
- "step": 82
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.8853,
- "step": 83
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5438,
- "step": 84
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2665,
- "step": 85
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5484,
- "step": 86
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7546,
- "step": 87
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4309,
- "step": 88
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5593,
- "step": 89
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3822,
- "step": 90
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.6315,
- "step": 91
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6116,
- "step": 92
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2288,
- "step": 93
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0053,
- "step": 94
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.359,
- "step": 95
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9235,
- "step": 96
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 31.9845,
- "step": 97
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.1385,
- "step": 98
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6161,
- "step": 99
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8096,
- "step": 100
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9918,
- "step": 101
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.344,
- "step": 102
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1607,
- "step": 103
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4834,
- "step": 104
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.704,
- "step": 105
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1238,
- "step": 106
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8066,
- "step": 107
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9656,
- "step": 108
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1979,
- "step": 109
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2294,
- "step": 110
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.066,
- "step": 111
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7914,
- "step": 112
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7344,
- "step": 113
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6703,
- "step": 114
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8817,
- "step": 115
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.7733,
- "step": 116
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.469,
- "step": 117
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1304,
- "step": 118
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.871,
- "step": 119
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5353,
- "step": 120
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9055,
- "step": 121
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6142,
- "step": 122
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0201,
- "step": 123
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3805,
- "step": 124
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6825,
- "step": 125
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7166,
- "step": 126
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.7747,
- "step": 127
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7695,
- "step": 128
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7291,
- "step": 129
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1296,
- "step": 130
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5374,
- "step": 131
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.1854,
- "step": 132
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.434,
- "step": 133
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.438,
- "step": 134
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3027,
- "step": 135
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.382,
- "step": 136
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9277,
- "step": 137
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.223,
- "step": 138
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3042,
- "step": 139
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6361,
- "step": 140
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3547,
- "step": 141
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.7181,
- "step": 142
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.7528,
- "step": 143
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.4316,
- "step": 144
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2219,
- "step": 145
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7788,
- "step": 146
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2749,
- "step": 147
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2397,
- "step": 148
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6243,
- "step": 149
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.145,
- "step": 150
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7951,
- "step": 151
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1862,
- "step": 152
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1305,
- "step": 153
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5766,
- "step": 154
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9232,
- "step": 155
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9936,
- "step": 156
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.9692,
- "step": 157
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2772,
- "step": 158
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.302,
- "step": 159
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9931,
- "step": 160
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9675,
- "step": 161
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8536,
- "step": 162
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6589,
- "step": 163
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.932,
- "step": 164
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0301,
- "step": 165
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4861,
- "step": 166
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1354,
- "step": 167
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0717,
- "step": 168
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9346,
- "step": 169
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9373,
- "step": 170
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8777,
- "step": 171
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4193,
- "step": 172
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6831,
- "step": 173
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4175,
- "step": 174
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3629,
- "step": 175
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.118,
- "step": 176
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.633,
- "step": 177
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8355,
- "step": 178
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4522,
- "step": 179
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9272,
- "step": 180
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4631,
- "step": 181
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2987,
- "step": 182
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1183,
- "step": 183
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.9976,
- "step": 184
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0668,
- "step": 185
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6291,
- "step": 186
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5937,
- "step": 187
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7382,
- "step": 188
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7677,
- "step": 189
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.0293,
- "step": 190
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.6407,
- "step": 191
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9508,
- "step": 192
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.5053,
- "step": 193
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.5718,
- "step": 194
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5211,
- "step": 195
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9557,
- "step": 196
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1609,
- "step": 197
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8505,
- "step": 198
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8278,
- "step": 199
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8447,
- "step": 200
- },
- {
- "epoch": 0.0,
- "eval_loss": 7.883856773376465,
- "eval_runtime": 22.4254,
- "eval_samples_per_second": 2.23,
- "eval_steps_per_second": 1.115,
- "step": 200
- },
- {
- "epoch": 0.0,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 4.629522514343262,
- "step": 200
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3249,
- "step": 201
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.352,
- "step": 202
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2984,
- "step": 203
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.2734,
- "step": 204
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1,
- "step": 205
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.448,
- "step": 206
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2387,
- "step": 207
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.861,
- "step": 208
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.603,
- "step": 209
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.29,
- "step": 210
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2105,
- "step": 211
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.1949,
- "step": 212
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0538,
- "step": 213
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0343,
- "step": 214
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7794,
- "step": 215
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5532,
- "step": 216
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2676,
- "step": 217
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.566,
- "step": 218
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0432,
- "step": 219
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9391,
- "step": 220
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.724,
- "step": 221
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.229,
- "step": 222
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3462,
- "step": 223
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0752,
- "step": 224
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.1966,
- "step": 225
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7279,
- "step": 226
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8484,
- "step": 227
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7291,
- "step": 228
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2665,
- "step": 229
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3551,
- "step": 230
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7338,
- "step": 231
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8407,
- "step": 232
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3581,
- "step": 233
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.441,
- "step": 234
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0788,
- "step": 235
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8404,
- "step": 236
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4314,
- "step": 237
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8426,
- "step": 238
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.0205,
- "step": 239
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4162,
- "step": 240
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7515,
- "step": 241
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1442,
- "step": 242
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5868,
- "step": 243
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6514,
- "step": 244
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2683,
- "step": 245
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.31,
- "step": 246
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0161,
- "step": 247
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.484,
- "step": 248
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9726,
- "step": 249
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0926,
- "step": 250
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5279,
- "step": 251
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0017,
- "step": 252
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5684,
- "step": 253
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3875,
- "step": 254
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9489,
- "step": 255
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.8948,
- "step": 256
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0856,
- "step": 257
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.599,
- "step": 258
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1575,
- "step": 259
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3701,
- "step": 260
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.464,
- "step": 261
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9193,
- "step": 262
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5679,
- "step": 263
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9424,
- "step": 264
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6689,
- "step": 265
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6475,
- "step": 266
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4311,
- "step": 267
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7426,
- "step": 268
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5191,
- "step": 269
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3059,
- "step": 270
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0142,
- "step": 271
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.4509,
- "step": 272
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0831,
- "step": 273
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6977,
- "step": 274
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4236,
- "step": 275
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2129,
- "step": 276
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1394,
- "step": 277
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.685,
- "step": 278
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0275,
- "step": 279
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.3215,
- "step": 280
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6542,
- "step": 281
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7614,
- "step": 282
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2996,
- "step": 283
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6275,
- "step": 284
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8736,
- "step": 285
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4667,
- "step": 286
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8486,
- "step": 287
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2125,
- "step": 288
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4523,
- "step": 289
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.551,
- "step": 290
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.7158,
- "step": 291
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5092,
- "step": 292
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9169,
- "step": 293
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5333,
- "step": 294
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9949,
- "step": 295
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.7189,
- "step": 296
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2366,
- "step": 297
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4745,
- "step": 298
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2439,
- "step": 299
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4176,
- "step": 300
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.9365,
- "step": 301
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5309,
- "step": 302
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2201,
- "step": 303
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0312,
- "step": 304
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4173,
- "step": 305
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4856,
- "step": 306
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5041,
- "step": 307
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3597,
- "step": 308
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8395,
- "step": 309
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0776,
- "step": 310
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7566,
- "step": 311
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9767,
- "step": 312
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3804,
- "step": 313
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5327,
- "step": 314
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5293,
- "step": 315
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4531,
- "step": 316
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3961,
- "step": 317
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5669,
- "step": 318
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8559,
- "step": 319
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.117,
- "step": 320
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4279,
- "step": 321
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7977,
- "step": 322
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.955,
- "step": 323
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0164,
- "step": 324
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.0495,
- "step": 325
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2768,
- "step": 326
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3162,
- "step": 327
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.88,
- "step": 328
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2157,
- "step": 329
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8427,
- "step": 330
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9729,
- "step": 331
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1779,
- "step": 332
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1302,
- "step": 333
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7705,
- "step": 334
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.523,
- "step": 335
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9375,
- "step": 336
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.1409,
- "step": 337
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.633,
- "step": 338
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6481,
- "step": 339
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.933,
- "step": 340
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9179,
- "step": 341
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9332,
- "step": 342
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6553,
- "step": 343
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7412,
- "step": 344
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.849,
- "step": 345
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.7321,
- "step": 346
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9717,
- "step": 347
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3465,
- "step": 348
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4535,
- "step": 349
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2376,
- "step": 350
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9025,
- "step": 351
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.916,
- "step": 352
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.3785,
- "step": 353
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0576,
- "step": 354
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5081,
- "step": 355
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1303,
- "step": 356
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3854,
- "step": 357
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5553,
- "step": 358
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9627,
- "step": 359
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.402,
- "step": 360
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3484,
- "step": 361
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5428,
- "step": 362
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9128,
- "step": 363
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3934,
- "step": 364
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4812,
- "step": 365
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5395,
- "step": 366
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6304,
- "step": 367
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5626,
- "step": 368
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5693,
- "step": 369
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3458,
- "step": 370
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6254,
- "step": 371
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8706,
- "step": 372
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6076,
- "step": 373
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.2912,
- "step": 374
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3326,
- "step": 375
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3735,
- "step": 376
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4916,
- "step": 377
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5553,
- "step": 378
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6241,
- "step": 379
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6106,
- "step": 380
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.266,
- "step": 381
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7738,
- "step": 382
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4988,
- "step": 383
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2968,
- "step": 384
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8512,
- "step": 385
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0341,
- "step": 386
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.898,
- "step": 387
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.23,
- "step": 388
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9608,
- "step": 389
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.3679,
- "step": 390
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.7074,
- "step": 391
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9903,
- "step": 392
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5845,
- "step": 393
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6493,
- "step": 394
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7962,
- "step": 395
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4865,
- "step": 396
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3418,
- "step": 397
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3942,
- "step": 398
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4715,
- "step": 399
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2073,
- "step": 400
- },
- {
- "epoch": 0.0,
- "eval_loss": 7.106412410736084,
- "eval_runtime": 22.5667,
- "eval_samples_per_second": 2.216,
- "eval_steps_per_second": 1.108,
- "step": 400
- },
- {
- "epoch": 0.0,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 2.9128687667846678,
- "step": 400
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3984,
- "step": 401
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7983,
- "step": 402
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8589,
- "step": 403
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9884,
- "step": 404
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4427,
- "step": 405
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0374,
- "step": 406
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7999,
- "step": 407
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2437,
- "step": 408
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6902,
- "step": 409
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.81,
- "step": 410
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8979,
- "step": 411
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0211,
- "step": 412
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3945,
- "step": 413
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5807,
- "step": 414
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1433,
- "step": 415
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9466,
- "step": 416
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6276,
- "step": 417
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4945,
- "step": 418
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.6215,
- "step": 419
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.3919,
- "step": 420
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7915,
- "step": 421
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3284,
- "step": 422
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8723,
- "step": 423
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0149,
- "step": 424
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.979,
- "step": 425
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9175,
- "step": 426
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4994,
- "step": 427
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9791,
- "step": 428
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1156,
- "step": 429
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5813,
- "step": 430
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1882,
- "step": 431
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9956,
- "step": 432
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6189,
- "step": 433
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9624,
- "step": 434
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5387,
- "step": 435
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4605,
- "step": 436
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.474,
- "step": 437
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0497,
- "step": 438
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5705,
- "step": 439
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.275,
- "step": 440
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9638,
- "step": 441
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4857,
- "step": 442
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3067,
- "step": 443
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8152,
- "step": 444
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1668,
- "step": 445
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5293,
- "step": 446
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3981,
- "step": 447
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4787,
- "step": 448
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5981,
- "step": 449
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3569,
- "step": 450
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4088,
- "step": 451
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3677,
- "step": 452
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.4686,
- "step": 453
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3552,
- "step": 454
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7931,
- "step": 455
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9285,
- "step": 456
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0554,
- "step": 457
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7277,
- "step": 458
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2474,
- "step": 459
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9274,
- "step": 460
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2558,
- "step": 461
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.7547,
- "step": 462
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1264,
- "step": 463
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2124,
- "step": 464
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8751,
- "step": 465
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7317,
- "step": 466
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3697,
- "step": 467
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0021,
- "step": 468
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3761,
- "step": 469
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2291,
- "step": 470
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7968,
- "step": 471
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9454,
- "step": 472
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0194,
- "step": 473
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5048,
- "step": 474
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6837,
- "step": 475
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1066,
- "step": 476
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3501,
- "step": 477
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5071,
- "step": 478
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1086,
- "step": 479
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7269,
- "step": 480
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5419,
- "step": 481
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2974,
- "step": 482
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.1433,
- "step": 483
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0869,
- "step": 484
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.032,
- "step": 485
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0946,
- "step": 486
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7162,
- "step": 487
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0406,
- "step": 488
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.9048,
- "step": 489
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2231,
- "step": 490
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.6524,
- "step": 491
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1151,
- "step": 492
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.591,
- "step": 493
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1628,
- "step": 494
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0757,
- "step": 495
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3471,
- "step": 496
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9385,
- "step": 497
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9362,
- "step": 498
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2252,
- "step": 499
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.359,
- "step": 500
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0497,
- "step": 501
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0484,
- "step": 502
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5773,
- "step": 503
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.39,
- "step": 504
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5923,
- "step": 505
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2,
- "step": 506
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5536,
- "step": 507
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.8958,
- "step": 508
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7763,
- "step": 509
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2045,
- "step": 510
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.4219,
- "step": 511
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6305,
- "step": 512
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4243,
- "step": 513
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7842,
- "step": 514
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8769,
- "step": 515
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8903,
- "step": 516
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0489,
- "step": 517
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1314,
- "step": 518
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5973,
- "step": 519
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8022,
- "step": 520
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3539,
- "step": 521
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.222,
- "step": 522
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5403,
- "step": 523
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1323,
- "step": 524
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7813,
- "step": 525
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4982,
- "step": 526
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2426,
- "step": 527
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0142,
- "step": 528
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8996,
- "step": 529
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8671,
- "step": 530
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4139,
- "step": 531
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9478,
- "step": 532
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7062,
- "step": 533
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.0098,
- "step": 534
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9195,
- "step": 535
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0255,
- "step": 536
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6291,
- "step": 537
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3245,
- "step": 538
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6382,
- "step": 539
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.8076,
- "step": 540
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6725,
- "step": 541
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0563,
- "step": 542
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.6178,
- "step": 543
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7974,
- "step": 544
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.7535,
- "step": 545
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4948,
- "step": 546
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8941,
- "step": 547
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.6496,
- "step": 548
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.9084,
- "step": 549
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.65,
- "step": 550
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7673,
- "step": 551
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2221,
- "step": 552
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.14,
- "step": 553
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.6747,
- "step": 554
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8009,
- "step": 555
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7307,
- "step": 556
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0143,
- "step": 557
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8098,
- "step": 558
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.026,
- "step": 559
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4572,
- "step": 560
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7913,
- "step": 561
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9962,
- "step": 562
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.767,
- "step": 563
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9497,
- "step": 564
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9626,
- "step": 565
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2536,
- "step": 566
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0421,
- "step": 567
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8177,
- "step": 568
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9241,
- "step": 569
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0162,
- "step": 570
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3368,
- "step": 571
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7515,
- "step": 572
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6389,
- "step": 573
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.662,
- "step": 574
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8097,
- "step": 575
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9346,
- "step": 576
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3154,
- "step": 577
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7724,
- "step": 578
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3685,
- "step": 579
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.2775,
- "step": 580
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.106,
- "step": 581
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4733,
- "step": 582
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2334,
- "step": 583
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9478,
- "step": 584
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0013,
- "step": 585
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7242,
- "step": 586
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.922,
- "step": 587
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1418,
- "step": 588
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4472,
- "step": 589
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4785,
- "step": 590
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.783,
- "step": 591
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0706,
- "step": 592
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4136,
- "step": 593
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5969,
- "step": 594
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5157,
- "step": 595
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5658,
- "step": 596
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4647,
- "step": 597
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2028,
- "step": 598
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6913,
- "step": 599
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7239,
- "step": 600
- },
- {
- "epoch": 0.0,
- "eval_loss": 7.012163162231445,
- "eval_runtime": 22.5807,
- "eval_samples_per_second": 2.214,
- "eval_steps_per_second": 1.107,
- "step": 600
- },
- {
- "epoch": 0.0,
- "mmlu_eval_accuracy": 0.3260281385281385,
- "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
- "mmlu_eval_accuracy_anatomy": 0.35714285714285715,
- "mmlu_eval_accuracy_astronomy": 0.25,
- "mmlu_eval_accuracy_business_ethics": 0.3333333333333333,
- "mmlu_loss": 4.24488224029541,
- "step": 600
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5253,
- "step": 601
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0392,
- "step": 602
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.447,
- "step": 603
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9441,
- "step": 604
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1874,
- "step": 605
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7817,
- "step": 606
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0348,
- "step": 607
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5593,
- "step": 608
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9361,
- "step": 609
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3534,
- "step": 610
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.476,
- "step": 611
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0937,
- "step": 612
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3027,
- "step": 613
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5586,
- "step": 614
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3796,
- "step": 615
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.676,
- "step": 616
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.5321,
- "step": 617
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0059,
- "step": 618
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6139,
- "step": 619
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.2391,
- "step": 620
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.0636,
- "step": 621
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0895,
- "step": 622
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.62,
- "step": 623
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0469,
- "step": 624
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2173,
- "step": 625
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9432,
- "step": 626
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3928,
- "step": 627
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0959,
- "step": 628
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.1197,
- "step": 629
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4277,
- "step": 630
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.418,
- "step": 631
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8687,
- "step": 632
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0156,
- "step": 633
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.573,
- "step": 634
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.112,
- "step": 635
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8954,
- "step": 636
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.36,
- "step": 637
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.924,
- "step": 638
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.4625,
- "step": 639
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2023,
- "step": 640
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0685,
- "step": 641
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.5304,
- "step": 642
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4456,
- "step": 643
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7271,
- "step": 644
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6011,
- "step": 645
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.895,
- "step": 646
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.864,
- "step": 647
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3452,
- "step": 648
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8978,
- "step": 649
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2253,
- "step": 650
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2813,
- "step": 651
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7248,
- "step": 652
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4283,
- "step": 653
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4304,
- "step": 654
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3893,
- "step": 655
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1115,
- "step": 656
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5892,
- "step": 657
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6572,
- "step": 658
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.925,
- "step": 659
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4431,
- "step": 660
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7711,
- "step": 661
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9439,
- "step": 662
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3781,
- "step": 663
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5573,
- "step": 664
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.4476,
- "step": 665
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0057,
- "step": 666
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2702,
- "step": 667
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5717,
- "step": 668
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2242,
- "step": 669
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1,
- "step": 670
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0517,
- "step": 671
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6543,
- "step": 672
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1138,
- "step": 673
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.461,
- "step": 674
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.7094,
- "step": 675
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.521,
- "step": 676
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7116,
- "step": 677
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6343,
- "step": 678
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3762,
- "step": 679
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3603,
- "step": 680
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7144,
- "step": 681
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4545,
- "step": 682
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8188,
- "step": 683
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7965,
- "step": 684
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.4675,
- "step": 685
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0436,
- "step": 686
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1219,
- "step": 687
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4517,
- "step": 688
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8476,
- "step": 689
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 1.9284,
- "step": 690
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7405,
- "step": 691
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7142,
- "step": 692
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3979,
- "step": 693
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 1.3285,
- "step": 694
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3418,
- "step": 695
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4472,
- "step": 696
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7355,
- "step": 697
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7982,
- "step": 698
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.4516,
- "step": 699
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.2532,
- "step": 700
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.9959,
- "step": 701
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0418,
- "step": 702
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.7767,
- "step": 703
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.774,
- "step": 704
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8912,
- "step": 705
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2178,
- "step": 706
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.6197,
- "step": 707
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4755,
- "step": 708
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8276,
- "step": 709
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.2925,
- "step": 710
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.3887,
- "step": 711
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1465,
- "step": 712
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.5806,
- "step": 713
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3063,
- "step": 714
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6066,
- "step": 715
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1536,
- "step": 716
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5582,
- "step": 717
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0353,
- "step": 718
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6415,
- "step": 719
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8291,
- "step": 720
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.7575,
- "step": 721
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9141,
- "step": 722
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5217,
- "step": 723
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4549,
- "step": 724
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8112,
- "step": 725
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2729,
- "step": 726
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8515,
- "step": 727
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9712,
- "step": 728
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.097,
- "step": 729
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0208,
- "step": 730
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1377,
- "step": 731
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4019,
- "step": 732
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.9869,
- "step": 733
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2954,
- "step": 734
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4144,
- "step": 735
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8053,
- "step": 736
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8891,
- "step": 737
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.812,
- "step": 738
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2657,
- "step": 739
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3747,
- "step": 740
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0364,
- "step": 741
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8845,
- "step": 742
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.887,
- "step": 743
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0706,
- "step": 744
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6619,
- "step": 745
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2941,
- "step": 746
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9192,
- "step": 747
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9947,
- "step": 748
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6376,
- "step": 749
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0358,
- "step": 750
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4578,
- "step": 751
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7784,
- "step": 752
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.632,
- "step": 753
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.8649,
- "step": 754
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7951,
- "step": 755
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3841,
- "step": 756
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4558,
- "step": 757
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7638,
- "step": 758
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9413,
- "step": 759
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0916,
- "step": 760
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1351,
- "step": 761
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6078,
- "step": 762
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7982,
- "step": 763
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6132,
- "step": 764
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.551,
- "step": 765
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3301,
- "step": 766
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4888,
- "step": 767
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1476,
- "step": 768
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4244,
- "step": 769
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.6025,
- "step": 770
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.102,
- "step": 771
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.017,
- "step": 772
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4101,
- "step": 773
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1741,
- "step": 774
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1256,
- "step": 775
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5164,
- "step": 776
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.6959,
- "step": 777
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7666,
- "step": 778
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4336,
- "step": 779
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 11.8478,
- "step": 780
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8382,
- "step": 781
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1792,
- "step": 782
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4424,
- "step": 783
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.345,
- "step": 784
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6887,
- "step": 785
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.9867,
- "step": 786
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6152,
- "step": 787
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7283,
- "step": 788
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0157,
- "step": 789
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6044,
- "step": 790
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4132,
- "step": 791
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.735,
- "step": 792
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3631,
- "step": 793
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2308,
- "step": 794
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2184,
- "step": 795
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.4661,
- "step": 796
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9637,
- "step": 797
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4178,
- "step": 798
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.5909,
- "step": 799
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.1482,
- "step": 800
- },
- {
- "epoch": 0.01,
- "eval_loss": 7.355834484100342,
- "eval_runtime": 22.6252,
- "eval_samples_per_second": 2.21,
- "eval_steps_per_second": 1.105,
- "step": 800
- },
- {
- "epoch": 0.01,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 5.191131496429444,
- "step": 800
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.0427,
- "step": 801
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.2669,
- "step": 802
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.8026,
- "step": 803
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4949,
- "step": 804
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4491,
- "step": 805
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0383,
- "step": 806
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.1213,
- "step": 807
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5158,
- "step": 808
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5648,
- "step": 809
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9932,
- "step": 810
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6441,
- "step": 811
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8661,
- "step": 812
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3609,
- "step": 813
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6828,
- "step": 814
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9693,
- "step": 815
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3733,
- "step": 816
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6286,
- "step": 817
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4349,
- "step": 818
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6706,
- "step": 819
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3089,
- "step": 820
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2394,
- "step": 821
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.963,
- "step": 822
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6564,
- "step": 823
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.997,
- "step": 824
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 1.9261,
- "step": 825
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1421,
- "step": 826
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2335,
- "step": 827
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3432,
- "step": 828
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0154,
- "step": 829
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5135,
- "step": 830
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6226,
- "step": 831
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1672,
- "step": 832
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0853,
- "step": 833
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1213,
- "step": 834
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7815,
- "step": 835
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8916,
- "step": 836
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6464,
- "step": 837
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3307,
- "step": 838
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8165,
- "step": 839
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.886,
- "step": 840
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4781,
- "step": 841
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.8351,
- "step": 842
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.358,
- "step": 843
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6501,
- "step": 844
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0864,
- "step": 845
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2922,
- "step": 846
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.9847,
- "step": 847
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.2558,
- "step": 848
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0195,
- "step": 849
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.996,
- "step": 850
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.5705,
- "step": 851
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4136,
- "step": 852
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.6302,
- "step": 853
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8761,
- "step": 854
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4995,
- "step": 855
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4762,
- "step": 856
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5749,
- "step": 857
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0273,
- "step": 858
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8258,
- "step": 859
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.1836,
- "step": 860
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5493,
- "step": 861
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1891,
- "step": 862
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.7392,
- "step": 863
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1655,
- "step": 864
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.5218,
- "step": 865
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3759,
- "step": 866
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2497,
- "step": 867
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5901,
- "step": 868
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0624,
- "step": 869
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.2452,
- "step": 870
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5649,
- "step": 871
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0826,
- "step": 872
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.2703,
- "step": 873
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9088,
- "step": 874
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3875,
- "step": 875
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2511,
- "step": 876
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4065,
- "step": 877
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.175,
- "step": 878
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8358,
- "step": 879
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3208,
- "step": 880
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2049,
- "step": 881
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8251,
- "step": 882
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4262,
- "step": 883
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2227,
- "step": 884
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1062,
- "step": 885
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9417,
- "step": 886
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3315,
- "step": 887
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0012,
- "step": 888
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6386,
- "step": 889
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0377,
- "step": 890
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6707,
- "step": 891
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4955,
- "step": 892
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7343,
- "step": 893
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8305,
- "step": 894
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.7016,
- "step": 895
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7149,
- "step": 896
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5649,
- "step": 897
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.815,
- "step": 898
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6135,
- "step": 899
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8776,
- "step": 900
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.7288,
- "step": 901
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8019,
- "step": 902
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0921,
- "step": 903
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.824,
- "step": 904
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7151,
- "step": 905
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.5054,
- "step": 906
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8095,
- "step": 907
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.3218,
- "step": 908
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9993,
- "step": 909
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4433,
- "step": 910
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5863,
- "step": 911
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.505,
- "step": 912
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9734,
- "step": 913
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1792,
- "step": 914
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4574,
- "step": 915
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2787,
- "step": 916
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8201,
- "step": 917
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2537,
- "step": 918
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1387,
- "step": 919
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7161,
- "step": 920
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2207,
- "step": 921
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7953,
- "step": 922
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9949,
- "step": 923
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9173,
- "step": 924
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7903,
- "step": 925
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4784,
- "step": 926
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2264,
- "step": 927
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.566,
- "step": 928
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0686,
- "step": 929
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.791,
- "step": 930
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8393,
- "step": 931
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4387,
- "step": 932
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2374,
- "step": 933
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.9598,
- "step": 934
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1597,
- "step": 935
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0403,
- "step": 936
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3301,
- "step": 937
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.072,
- "step": 938
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4788,
- "step": 939
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0656,
- "step": 940
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9647,
- "step": 941
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1168,
- "step": 942
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0293,
- "step": 943
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3622,
- "step": 944
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8957,
- "step": 945
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.4,
- "step": 946
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.6626,
- "step": 947
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8212,
- "step": 948
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8638,
- "step": 949
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6406,
- "step": 950
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7069,
- "step": 951
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1384,
- "step": 952
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.612,
- "step": 953
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7201,
- "step": 954
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3532,
- "step": 955
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1266,
- "step": 956
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6192,
- "step": 957
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.826,
- "step": 958
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9338,
- "step": 959
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.4487,
- "step": 960
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.872,
- "step": 961
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8601,
- "step": 962
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7401,
- "step": 963
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5412,
- "step": 964
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2501,
- "step": 965
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6837,
- "step": 966
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6494,
- "step": 967
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.604,
- "step": 968
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.837,
- "step": 969
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3957,
- "step": 970
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3281,
- "step": 971
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8264,
- "step": 972
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6554,
- "step": 973
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.5768,
- "step": 974
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4187,
- "step": 975
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8479,
- "step": 976
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9849,
- "step": 977
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6471,
- "step": 978
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8041,
- "step": 979
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8876,
- "step": 980
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6423,
- "step": 981
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5329,
- "step": 982
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2801,
- "step": 983
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1699,
- "step": 984
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6469,
- "step": 985
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6766,
- "step": 986
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7538,
- "step": 987
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9606,
- "step": 988
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0713,
- "step": 989
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4965,
- "step": 990
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3408,
- "step": 991
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4007,
- "step": 992
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.8921,
- "step": 993
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8681,
- "step": 994
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 1.8867,
- "step": 995
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.467,
- "step": 996
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7895,
- "step": 997
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0523,
- "step": 998
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4032,
- "step": 999
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7719,
- "step": 1000
- },
- {
- "epoch": 0.01,
- "eval_loss": 6.766034126281738,
- "eval_runtime": 22.4042,
- "eval_samples_per_second": 2.232,
- "eval_steps_per_second": 1.116,
- "step": 1000
- },
- {
- "epoch": 0.01,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 4.338861379623413,
- "step": 1000
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0285,
- "step": 1001
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4571,
- "step": 1002
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7721,
- "step": 1003
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5108,
- "step": 1004
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3813,
- "step": 1005
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.7963,
- "step": 1006
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1101,
- "step": 1007
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.021,
- "step": 1008
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5916,
- "step": 1009
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8813,
- "step": 1010
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1193,
- "step": 1011
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5092,
- "step": 1012
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8569,
- "step": 1013
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.119,
- "step": 1014
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3247,
- "step": 1015
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.2358,
- "step": 1016
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2795,
- "step": 1017
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3466,
- "step": 1018
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5443,
- "step": 1019
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7296,
- "step": 1020
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0412,
- "step": 1021
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4829,
- "step": 1022
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7901,
- "step": 1023
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8077,
- "step": 1024
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4887,
- "step": 1025
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3095,
- "step": 1026
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3235,
- "step": 1027
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.6315,
- "step": 1028
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4294,
- "step": 1029
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8457,
- "step": 1030
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7583,
- "step": 1031
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3129,
- "step": 1032
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1832,
- "step": 1033
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.1764,
- "step": 1034
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0101,
- "step": 1035
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6524,
- "step": 1036
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2825,
- "step": 1037
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2262,
- "step": 1038
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2533,
- "step": 1039
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8794,
- "step": 1040
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7901,
- "step": 1041
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8351,
- "step": 1042
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5888,
- "step": 1043
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8932,
- "step": 1044
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2999,
- "step": 1045
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8396,
- "step": 1046
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.4209,
- "step": 1047
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.1524,
- "step": 1048
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7784,
- "step": 1049
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0179,
- "step": 1050
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1153,
- "step": 1051
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2149,
- "step": 1052
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0117,
- "step": 1053
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9693,
- "step": 1054
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5656,
- "step": 1055
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5,
- "step": 1056
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.102,
- "step": 1057
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3079,
- "step": 1058
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5754,
- "step": 1059
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6989,
- "step": 1060
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9597,
- "step": 1061
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3743,
- "step": 1062
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8887,
- "step": 1063
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.3779,
- "step": 1064
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5001,
- "step": 1065
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4095,
- "step": 1066
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5073,
- "step": 1067
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1331,
- "step": 1068
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.323,
- "step": 1069
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6116,
- "step": 1070
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.1212,
- "step": 1071
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0951,
- "step": 1072
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.2463,
- "step": 1073
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4488,
- "step": 1074
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.279,
- "step": 1075
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.5728,
- "step": 1076
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1362,
- "step": 1077
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6648,
- "step": 1078
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.427,
- "step": 1079
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8145,
- "step": 1080
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5308,
- "step": 1081
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.974,
- "step": 1082
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1965,
- "step": 1083
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8749,
- "step": 1084
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7352,
- "step": 1085
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.7934,
- "step": 1086
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6003,
- "step": 1087
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5775,
- "step": 1088
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.519,
- "step": 1089
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7403,
- "step": 1090
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8145,
- "step": 1091
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5776,
- "step": 1092
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3753,
- "step": 1093
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.9586,
- "step": 1094
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7263,
- "step": 1095
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.7034,
- "step": 1096
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0579,
- "step": 1097
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.8419,
- "step": 1098
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0751,
- "step": 1099
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6438,
- "step": 1100
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8744,
- "step": 1101
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4992,
- "step": 1102
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.8094,
- "step": 1103
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.162,
- "step": 1104
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.8351,
- "step": 1105
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8845,
- "step": 1106
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1894,
- "step": 1107
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.8333,
- "step": 1108
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4226,
- "step": 1109
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0712,
- "step": 1110
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.9981,
- "step": 1111
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.5885,
- "step": 1112
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.1915,
- "step": 1113
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8003,
- "step": 1114
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5566,
- "step": 1115
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4085,
- "step": 1116
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0793,
- "step": 1117
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0909,
- "step": 1118
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2273,
- "step": 1119
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8273,
- "step": 1120
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0231,
- "step": 1121
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7229,
- "step": 1122
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4479,
- "step": 1123
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2178,
- "step": 1124
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9038,
- "step": 1125
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2653,
- "step": 1126
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2974,
- "step": 1127
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3003,
- "step": 1128
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7853,
- "step": 1129
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.9143,
- "step": 1130
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2573,
- "step": 1131
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7091,
- "step": 1132
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3372,
- "step": 1133
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4165,
- "step": 1134
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4422,
- "step": 1135
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.7693,
- "step": 1136
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7802,
- "step": 1137
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7263,
- "step": 1138
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.6749,
- "step": 1139
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9459,
- "step": 1140
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9697,
- "step": 1141
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4506,
- "step": 1142
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5099,
- "step": 1143
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1475,
- "step": 1144
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3769,
- "step": 1145
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2035,
- "step": 1146
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6017,
- "step": 1147
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.463,
- "step": 1148
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3844,
- "step": 1149
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5306,
- "step": 1150
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5502,
- "step": 1151
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7394,
- "step": 1152
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5626,
- "step": 1153
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1618,
- "step": 1154
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5174,
- "step": 1155
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1038,
- "step": 1156
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3789,
- "step": 1157
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2985,
- "step": 1158
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4763,
- "step": 1159
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.5071,
- "step": 1160
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0827,
- "step": 1161
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7349,
- "step": 1162
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.798,
- "step": 1163
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3176,
- "step": 1164
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8114,
- "step": 1165
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3379,
- "step": 1166
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1157,
- "step": 1167
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.4675,
- "step": 1168
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2721,
- "step": 1169
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0603,
- "step": 1170
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6358,
- "step": 1171
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0865,
- "step": 1172
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.709,
- "step": 1173
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7705,
- "step": 1174
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7677,
- "step": 1175
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2418,
- "step": 1176
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.7114,
- "step": 1177
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1165,
- "step": 1178
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9654,
- "step": 1179
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0672,
- "step": 1180
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1738,
- "step": 1181
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7604,
- "step": 1182
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8426,
- "step": 1183
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0231,
- "step": 1184
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2938,
- "step": 1185
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.783,
- "step": 1186
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3328,
- "step": 1187
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.321,
- "step": 1188
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6368,
- "step": 1189
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.101,
- "step": 1190
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6777,
- "step": 1191
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0831,
- "step": 1192
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5853,
- "step": 1193
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7923,
- "step": 1194
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3734,
- "step": 1195
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4268,
- "step": 1196
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6796,
- "step": 1197
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.9028,
- "step": 1198
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3716,
- "step": 1199
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6761,
- "step": 1200
- },
- {
- "epoch": 0.01,
- "eval_loss": 6.9188361167907715,
- "eval_runtime": 22.426,
- "eval_samples_per_second": 2.23,
- "eval_steps_per_second": 1.115,
- "step": 1200
- },
- {
- "epoch": 0.01,
- "mmlu_eval_accuracy": 0.3260281385281385,
- "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
- "mmlu_eval_accuracy_anatomy": 0.35714285714285715,
- "mmlu_eval_accuracy_astronomy": 0.25,
- "mmlu_eval_accuracy_business_ethics": 0.3333333333333333,
- "mmlu_loss": 3.3686839294433595,
- "step": 1200
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8855,
- "step": 1201
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.8206,
- "step": 1202
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4401,
- "step": 1203
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2366,
- "step": 1204
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9885,
- "step": 1205
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5444,
- "step": 1206
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4259,
- "step": 1207
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.5369,
- "step": 1208
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0839,
- "step": 1209
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7622,
- "step": 1210
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.8979,
- "step": 1211
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5508,
- "step": 1212
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6439,
- "step": 1213
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6249,
- "step": 1214
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.495,
- "step": 1215
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0642,
- "step": 1216
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8997,
- "step": 1217
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6275,
- "step": 1218
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.3317,
- "step": 1219
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4635,
- "step": 1220
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5197,
- "step": 1221
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5928,
- "step": 1222
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.2363,
- "step": 1223
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0266,
- "step": 1224
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3356,
- "step": 1225
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.7927,
- "step": 1226
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6952,
- "step": 1227
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8878,
- "step": 1228
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7472,
- "step": 1229
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6454,
- "step": 1230
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4972,
- "step": 1231
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.3347,
- "step": 1232
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1631,
- "step": 1233
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.4708,
- "step": 1234
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.5697,
- "step": 1235
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8218,
- "step": 1236
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.269,
- "step": 1237
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4165,
- "step": 1238
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3653,
- "step": 1239
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0152,
- "step": 1240
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.9157,
- "step": 1241
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4086,
- "step": 1242
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2493,
- "step": 1243
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8279,
- "step": 1244
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.6649,
- "step": 1245
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4405,
- "step": 1246
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.1992,
- "step": 1247
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2055,
- "step": 1248
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4395,
- "step": 1249
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2475,
- "step": 1250
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8205,
- "step": 1251
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1053,
- "step": 1252
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.7494,
- "step": 1253
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7387,
- "step": 1254
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8983,
- "step": 1255
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5614,
- "step": 1256
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.7617,
- "step": 1257
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.2445,
- "step": 1258
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3043,
- "step": 1259
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4214,
- "step": 1260
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1384,
- "step": 1261
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3914,
- "step": 1262
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3287,
- "step": 1263
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2174,
- "step": 1264
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4397,
- "step": 1265
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6875,
- "step": 1266
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4512,
- "step": 1267
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2834,
- "step": 1268
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7651,
- "step": 1269
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9263,
- "step": 1270
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6721,
- "step": 1271
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.9178,
- "step": 1272
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7967,
- "step": 1273
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5242,
- "step": 1274
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7794,
- "step": 1275
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4256,
- "step": 1276
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5788,
- "step": 1277
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7586,
- "step": 1278
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.964,
- "step": 1279
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0749,
- "step": 1280
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6248,
- "step": 1281
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2465,
- "step": 1282
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.1591,
- "step": 1283
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4328,
- "step": 1284
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.534,
- "step": 1285
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.523,
- "step": 1286
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.5672,
- "step": 1287
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9162,
- "step": 1288
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.1089,
- "step": 1289
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.3287,
- "step": 1290
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2499,
- "step": 1291
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9645,
- "step": 1292
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3903,
- "step": 1293
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.5322,
- "step": 1294
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2211,
- "step": 1295
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2788,
- "step": 1296
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1862,
- "step": 1297
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2678,
- "step": 1298
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5399,
- "step": 1299
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7935,
- "step": 1300
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0391,
- "step": 1301
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1049,
- "step": 1302
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.365,
- "step": 1303
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.8809,
- "step": 1304
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2335,
- "step": 1305
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.5135,
- "step": 1306
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2378,
- "step": 1307
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9265,
- "step": 1308
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.641,
- "step": 1309
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9822,
- "step": 1310
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3369,
- "step": 1311
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3735,
- "step": 1312
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.2618,
- "step": 1313
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6854,
- "step": 1314
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3748,
- "step": 1315
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9206,
- "step": 1316
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1969,
- "step": 1317
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1245,
- "step": 1318
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.9977,
- "step": 1319
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5319,
- "step": 1320
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4431,
- "step": 1321
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7264,
- "step": 1322
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.05,
- "step": 1323
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3118,
- "step": 1324
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4575,
- "step": 1325
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.593,
- "step": 1326
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0061,
- "step": 1327
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2481,
- "step": 1328
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8017,
- "step": 1329
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8617,
- "step": 1330
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7036,
- "step": 1331
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0091,
- "step": 1332
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.9687,
- "step": 1333
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3925,
- "step": 1334
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1127,
- "step": 1335
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8163,
- "step": 1336
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0639,
- "step": 1337
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8929,
- "step": 1338
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5011,
- "step": 1339
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.033,
- "step": 1340
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0526,
- "step": 1341
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4428,
- "step": 1342
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3477,
- "step": 1343
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.881,
- "step": 1344
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.5276,
- "step": 1345
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4183,
- "step": 1346
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4943,
- "step": 1347
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9187,
- "step": 1348
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1003,
- "step": 1349
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1187,
- "step": 1350
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8081,
- "step": 1351
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4695,
- "step": 1352
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5761,
- "step": 1353
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9635,
- "step": 1354
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2133,
- "step": 1355
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2611,
- "step": 1356
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.6885,
- "step": 1357
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1157,
- "step": 1358
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4421,
- "step": 1359
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2128,
- "step": 1360
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6978,
- "step": 1361
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9804,
- "step": 1362
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.3426,
- "step": 1363
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2676,
- "step": 1364
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.325,
- "step": 1365
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1263,
- "step": 1366
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.7481,
- "step": 1367
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6891,
- "step": 1368
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8568,
- "step": 1369
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9893,
- "step": 1370
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0302,
- "step": 1371
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3204,
- "step": 1372
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9008,
- "step": 1373
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2624,
- "step": 1374
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6234,
- "step": 1375
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2286,
- "step": 1376
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3426,
- "step": 1377
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1962,
- "step": 1378
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3142,
- "step": 1379
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.414,
- "step": 1380
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0191,
- "step": 1381
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.4953,
- "step": 1382
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.6694,
- "step": 1383
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8611,
- "step": 1384
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.86,
- "step": 1385
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6519,
- "step": 1386
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.394,
- "step": 1387
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2117,
- "step": 1388
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.9924,
- "step": 1389
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.58,
- "step": 1390
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.4415,
- "step": 1391
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.7196,
- "step": 1392
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.7388,
- "step": 1393
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4784,
- "step": 1394
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.496,
- "step": 1395
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.8706,
- "step": 1396
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1858,
- "step": 1397
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.9038,
- "step": 1398
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4852,
- "step": 1399
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2717,
- "step": 1400
- },
- {
- "epoch": 0.01,
- "eval_loss": 6.97923469543457,
- "eval_runtime": 22.472,
- "eval_samples_per_second": 2.225,
- "eval_steps_per_second": 1.112,
- "step": 1400
- },
- {
- "epoch": 0.01,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 3.657382688522339,
- "step": 1400
- }
- ],
- "max_steps": 30000,
- "num_train_epochs": 1,
- "total_flos": 2.367869139143885e+16,
- "trial_name": null,
- "trial_params": null
-}
diff --git a/checkpoint-1400/training_args.bin b/checkpoint-1400/training_args.bin
deleted file mode 100644
index 29a1b90871dc30211978426049e89f31e2b38f56..0000000000000000000000000000000000000000
--- a/checkpoint-1400/training_args.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2493c95326c359fb00f88976162bc7966690beaaca22964b91c1db649a04988f
-size 6011
diff --git a/checkpoint-1600/README.md b/checkpoint-1600/README.md
deleted file mode 100644
index 82793f73e61dbb024e11fc6697bba1622d4d0db6..0000000000000000000000000000000000000000
--- a/checkpoint-1600/README.md
+++ /dev/null
@@ -1,20 +0,0 @@
----
-library_name: peft
----
-## Training procedure
-
-
-The following `bitsandbytes` quantization config was used during training:
-- load_in_8bit: False
-- load_in_4bit: True
-- llm_int8_threshold: 6.0
-- llm_int8_skip_modules: None
-- llm_int8_enable_fp32_cpu_offload: False
-- llm_int8_has_fp16_weight: False
-- bnb_4bit_quant_type: nf4
-- bnb_4bit_use_double_quant: True
-- bnb_4bit_compute_dtype: bfloat16
-### Framework versions
-
-
-- PEFT 0.4.0
diff --git a/checkpoint-1600/adapter_config.json b/checkpoint-1600/adapter_config.json
deleted file mode 100644
index a2f0ea437da66b2120cc72d92fb46f999dfb8535..0000000000000000000000000000000000000000
--- a/checkpoint-1600/adapter_config.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
- "auto_mapping": null,
- "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf",
- "bias": "none",
- "fan_in_fan_out": false,
- "inference_mode": true,
- "init_lora_weights": true,
- "layers_pattern": null,
- "layers_to_transform": null,
- "lora_alpha": 16.0,
- "lora_dropout": 0.1,
- "modules_to_save": null,
- "peft_type": "LORA",
- "r": 64,
- "revision": null,
- "target_modules": [
- "down_proj",
- "up_proj",
- "q_proj",
- "gate_proj",
- "o_proj",
- "v_proj",
- "k_proj"
- ],
- "task_type": "CAUSAL_LM"
-}
\ No newline at end of file
diff --git a/checkpoint-1600/adapter_model.bin b/checkpoint-1600/adapter_model.bin
deleted file mode 100644
index 061637b166456b007b0b30ddcf2dc35ea3d14ab4..0000000000000000000000000000000000000000
--- a/checkpoint-1600/adapter_model.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b503ce7c2f5a638f100ba5da6d3bb1e96b10f33a707b306f142dcd7320296887
-size 871609293
diff --git a/checkpoint-1600/adapter_model/adapter_model/README.md b/checkpoint-1600/adapter_model/adapter_model/README.md
deleted file mode 100644
index b2a9ac08c477a18d16ef75ee89b21cee91a6169a..0000000000000000000000000000000000000000
--- a/checkpoint-1600/adapter_model/adapter_model/README.md
+++ /dev/null
@@ -1,44 +0,0 @@
----
-library_name: peft
----
-## Training procedure
-
-
-The following `bitsandbytes` quantization config was used during training:
-- load_in_8bit: False
-- load_in_4bit: True
-- llm_int8_threshold: 6.0
-- llm_int8_skip_modules: None
-- llm_int8_enable_fp32_cpu_offload: False
-- llm_int8_has_fp16_weight: False
-- bnb_4bit_quant_type: nf4
-- bnb_4bit_use_double_quant: True
-- bnb_4bit_compute_dtype: bfloat16
-
-The following `bitsandbytes` quantization config was used during training:
-- load_in_8bit: False
-- load_in_4bit: True
-- llm_int8_threshold: 6.0
-- llm_int8_skip_modules: None
-- llm_int8_enable_fp32_cpu_offload: False
-- llm_int8_has_fp16_weight: False
-- bnb_4bit_quant_type: nf4
-- bnb_4bit_use_double_quant: True
-- bnb_4bit_compute_dtype: bfloat16
-
-The following `bitsandbytes` quantization config was used during training:
-- load_in_8bit: False
-- load_in_4bit: True
-- llm_int8_threshold: 6.0
-- llm_int8_skip_modules: None
-- llm_int8_enable_fp32_cpu_offload: False
-- llm_int8_has_fp16_weight: False
-- bnb_4bit_quant_type: nf4
-- bnb_4bit_use_double_quant: True
-- bnb_4bit_compute_dtype: bfloat16
-### Framework versions
-
-- PEFT 0.4.0
-- PEFT 0.4.0
-
-- PEFT 0.4.0
diff --git a/checkpoint-1600/adapter_model/adapter_model/adapter_config.json b/checkpoint-1600/adapter_model/adapter_model/adapter_config.json
deleted file mode 100644
index a2f0ea437da66b2120cc72d92fb46f999dfb8535..0000000000000000000000000000000000000000
--- a/checkpoint-1600/adapter_model/adapter_model/adapter_config.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
- "auto_mapping": null,
- "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf",
- "bias": "none",
- "fan_in_fan_out": false,
- "inference_mode": true,
- "init_lora_weights": true,
- "layers_pattern": null,
- "layers_to_transform": null,
- "lora_alpha": 16.0,
- "lora_dropout": 0.1,
- "modules_to_save": null,
- "peft_type": "LORA",
- "r": 64,
- "revision": null,
- "target_modules": [
- "down_proj",
- "up_proj",
- "q_proj",
- "gate_proj",
- "o_proj",
- "v_proj",
- "k_proj"
- ],
- "task_type": "CAUSAL_LM"
-}
\ No newline at end of file
diff --git a/checkpoint-1600/adapter_model/adapter_model/adapter_model.bin b/checkpoint-1600/adapter_model/adapter_model/adapter_model.bin
deleted file mode 100644
index 21fcecf4036cafc10e52f2215417aad4fd4776d8..0000000000000000000000000000000000000000
--- a/checkpoint-1600/adapter_model/adapter_model/adapter_model.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ca7fae775be00cb7e472a75158d8ab72644b8579a83e290f3c7c2b2bc675e8dc
-size 871609293
diff --git a/checkpoint-1600/added_tokens.json b/checkpoint-1600/added_tokens.json
deleted file mode 100644
index e41416ddd79948246ea2dced6800ea3cd531c424..0000000000000000000000000000000000000000
--- a/checkpoint-1600/added_tokens.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
- "[PAD]": 32000
-}
diff --git a/checkpoint-1600/optimizer.pt b/checkpoint-1600/optimizer.pt
deleted file mode 100644
index 065489829f508f752f2f246277fcff2f4eb8d539..0000000000000000000000000000000000000000
--- a/checkpoint-1600/optimizer.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f3b1009ddf180f8a75e4e4c7e70b09c050d575ea55a95be5113766472ac85f63
-size 873873439
diff --git a/checkpoint-1600/rng_state.pth b/checkpoint-1600/rng_state.pth
deleted file mode 100644
index 979c36e9b1bb882e09cec63ed1e51af5b745420e..0000000000000000000000000000000000000000
--- a/checkpoint-1600/rng_state.pth
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:42456691be03d4e7f251ecd394f3a1db103fa9136263ad733e87ee530de3678e
-size 14511
diff --git a/checkpoint-1600/scheduler.pt b/checkpoint-1600/scheduler.pt
deleted file mode 100644
index b7e22bca944acf064c516ffc4271ea6d41d6d134..0000000000000000000000000000000000000000
--- a/checkpoint-1600/scheduler.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9a6ae13f0c7f359a6b405d50e69129c1f2abc4f10fc1e47508750a1aa5936c11
-size 627
diff --git a/checkpoint-1600/special_tokens_map.json b/checkpoint-1600/special_tokens_map.json
deleted file mode 100644
index 3f58a5e115855c6ea3cec98accae196ad927222e..0000000000000000000000000000000000000000
--- a/checkpoint-1600/special_tokens_map.json
+++ /dev/null
@@ -1,6 +0,0 @@
-{
- "bos_token": "",
- "eos_token": "",
- "pad_token": "[PAD]",
- "unk_token": ""
-}
diff --git a/checkpoint-1600/tokenizer.model b/checkpoint-1600/tokenizer.model
deleted file mode 100644
index 6c00c742ce03c627d6cd5b795984876fa49fa899..0000000000000000000000000000000000000000
--- a/checkpoint-1600/tokenizer.model
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
-size 499723
diff --git a/checkpoint-1600/tokenizer_config.json b/checkpoint-1600/tokenizer_config.json
deleted file mode 100644
index daaef2433dab9469de98b5b9a3848221ab25b7e8..0000000000000000000000000000000000000000
--- a/checkpoint-1600/tokenizer_config.json
+++ /dev/null
@@ -1,35 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "bos_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- },
- "clean_up_tokenization_spaces": false,
- "eos_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- },
- "legacy": null,
- "model_max_length": 1000000000000000019884624838656,
- "pad_token": null,
- "padding_side": "right",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizer",
- "unk_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/checkpoint-1600/trainer_state.json b/checkpoint-1600/trainer_state.json
deleted file mode 100644
index ba4ab23e58725e397f554d9f649f3126c58662ff..0000000000000000000000000000000000000000
--- a/checkpoint-1600/trainer_state.json
+++ /dev/null
@@ -1,9760 +0,0 @@
-{
- "best_metric": 6.617897987365723,
- "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-1600",
- "epoch": 0.012222137346268428,
- "global_step": 1600,
- "is_hyper_param_search": false,
- "is_local_process_zero": true,
- "is_world_process_zero": true,
- "log_history": [
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0808,
- "step": 1
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8773,
- "step": 2
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1965,
- "step": 3
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.118,
- "step": 4
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1773,
- "step": 5
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1165,
- "step": 6
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2666,
- "step": 7
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3704,
- "step": 8
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9976,
- "step": 9
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.985,
- "step": 10
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.0541,
- "step": 11
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.6228,
- "step": 12
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.3651,
- "step": 13
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0867,
- "step": 14
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.4422,
- "step": 15
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.7759,
- "step": 16
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1446,
- "step": 17
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0007,
- "step": 18
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0894,
- "step": 19
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2424,
- "step": 20
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.1343,
- "step": 21
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5354,
- "step": 22
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1887,
- "step": 23
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.6652,
- "step": 24
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.964,
- "step": 25
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1872,
- "step": 26
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.4722,
- "step": 27
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1462,
- "step": 28
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0485,
- "step": 29
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.148,
- "step": 30
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7274,
- "step": 31
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.6689,
- "step": 32
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3384,
- "step": 33
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.5354,
- "step": 34
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.1976,
- "step": 35
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.8593,
- "step": 36
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.9302,
- "step": 37
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5968,
- "step": 38
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3169,
- "step": 39
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.1793,
- "step": 40
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.8457,
- "step": 41
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5177,
- "step": 42
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.003,
- "step": 43
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.9928,
- "step": 44
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.2574,
- "step": 45
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3915,
- "step": 46
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4105,
- "step": 47
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.1184,
- "step": 48
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.72,
- "step": 49
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9628,
- "step": 50
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2372,
- "step": 51
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3733,
- "step": 52
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8936,
- "step": 53
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5353,
- "step": 54
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.0754,
- "step": 55
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6685,
- "step": 56
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8984,
- "step": 57
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2265,
- "step": 58
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7696,
- "step": 59
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7349,
- "step": 60
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0221,
- "step": 61
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.1901,
- "step": 62
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.387,
- "step": 63
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7323,
- "step": 64
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2077,
- "step": 65
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.3155,
- "step": 66
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1656,
- "step": 67
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 13.0828,
- "step": 68
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5295,
- "step": 69
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4575,
- "step": 70
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.7654,
- "step": 71
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.6263,
- "step": 72
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 24.8238,
- "step": 73
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.0654,
- "step": 74
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 28.1046,
- "step": 75
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.3232,
- "step": 76
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 22.9712,
- "step": 77
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 18.8529,
- "step": 78
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.8356,
- "step": 79
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 16.472,
- "step": 80
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.2369,
- "step": 81
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.0731,
- "step": 82
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.8853,
- "step": 83
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5438,
- "step": 84
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2665,
- "step": 85
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5484,
- "step": 86
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7546,
- "step": 87
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4309,
- "step": 88
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5593,
- "step": 89
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3822,
- "step": 90
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.6315,
- "step": 91
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6116,
- "step": 92
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2288,
- "step": 93
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0053,
- "step": 94
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.359,
- "step": 95
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9235,
- "step": 96
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 31.9845,
- "step": 97
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.1385,
- "step": 98
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6161,
- "step": 99
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8096,
- "step": 100
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9918,
- "step": 101
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.344,
- "step": 102
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1607,
- "step": 103
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4834,
- "step": 104
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.704,
- "step": 105
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1238,
- "step": 106
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8066,
- "step": 107
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9656,
- "step": 108
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1979,
- "step": 109
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2294,
- "step": 110
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.066,
- "step": 111
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7914,
- "step": 112
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7344,
- "step": 113
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6703,
- "step": 114
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8817,
- "step": 115
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.7733,
- "step": 116
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.469,
- "step": 117
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1304,
- "step": 118
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.871,
- "step": 119
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5353,
- "step": 120
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9055,
- "step": 121
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6142,
- "step": 122
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0201,
- "step": 123
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3805,
- "step": 124
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6825,
- "step": 125
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7166,
- "step": 126
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.7747,
- "step": 127
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7695,
- "step": 128
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7291,
- "step": 129
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1296,
- "step": 130
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5374,
- "step": 131
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.1854,
- "step": 132
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.434,
- "step": 133
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.438,
- "step": 134
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3027,
- "step": 135
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.382,
- "step": 136
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9277,
- "step": 137
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.223,
- "step": 138
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3042,
- "step": 139
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6361,
- "step": 140
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3547,
- "step": 141
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.7181,
- "step": 142
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.7528,
- "step": 143
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.4316,
- "step": 144
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2219,
- "step": 145
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7788,
- "step": 146
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2749,
- "step": 147
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2397,
- "step": 148
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6243,
- "step": 149
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.145,
- "step": 150
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7951,
- "step": 151
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1862,
- "step": 152
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1305,
- "step": 153
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5766,
- "step": 154
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9232,
- "step": 155
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9936,
- "step": 156
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.9692,
- "step": 157
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2772,
- "step": 158
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.302,
- "step": 159
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9931,
- "step": 160
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9675,
- "step": 161
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8536,
- "step": 162
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6589,
- "step": 163
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.932,
- "step": 164
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0301,
- "step": 165
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4861,
- "step": 166
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1354,
- "step": 167
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0717,
- "step": 168
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9346,
- "step": 169
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9373,
- "step": 170
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8777,
- "step": 171
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4193,
- "step": 172
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6831,
- "step": 173
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4175,
- "step": 174
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3629,
- "step": 175
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.118,
- "step": 176
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.633,
- "step": 177
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8355,
- "step": 178
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4522,
- "step": 179
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9272,
- "step": 180
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4631,
- "step": 181
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2987,
- "step": 182
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1183,
- "step": 183
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.9976,
- "step": 184
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0668,
- "step": 185
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6291,
- "step": 186
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5937,
- "step": 187
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7382,
- "step": 188
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7677,
- "step": 189
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.0293,
- "step": 190
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.6407,
- "step": 191
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9508,
- "step": 192
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.5053,
- "step": 193
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.5718,
- "step": 194
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5211,
- "step": 195
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9557,
- "step": 196
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1609,
- "step": 197
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8505,
- "step": 198
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8278,
- "step": 199
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8447,
- "step": 200
- },
- {
- "epoch": 0.0,
- "eval_loss": 7.883856773376465,
- "eval_runtime": 22.4254,
- "eval_samples_per_second": 2.23,
- "eval_steps_per_second": 1.115,
- "step": 200
- },
- {
- "epoch": 0.0,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 4.629522514343262,
- "step": 200
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3249,
- "step": 201
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.352,
- "step": 202
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2984,
- "step": 203
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.2734,
- "step": 204
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1,
- "step": 205
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.448,
- "step": 206
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2387,
- "step": 207
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.861,
- "step": 208
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.603,
- "step": 209
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.29,
- "step": 210
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2105,
- "step": 211
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.1949,
- "step": 212
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0538,
- "step": 213
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0343,
- "step": 214
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7794,
- "step": 215
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5532,
- "step": 216
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2676,
- "step": 217
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.566,
- "step": 218
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0432,
- "step": 219
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9391,
- "step": 220
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.724,
- "step": 221
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.229,
- "step": 222
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3462,
- "step": 223
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0752,
- "step": 224
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.1966,
- "step": 225
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7279,
- "step": 226
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8484,
- "step": 227
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7291,
- "step": 228
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2665,
- "step": 229
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3551,
- "step": 230
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7338,
- "step": 231
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8407,
- "step": 232
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3581,
- "step": 233
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.441,
- "step": 234
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0788,
- "step": 235
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8404,
- "step": 236
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4314,
- "step": 237
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8426,
- "step": 238
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.0205,
- "step": 239
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4162,
- "step": 240
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7515,
- "step": 241
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1442,
- "step": 242
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5868,
- "step": 243
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6514,
- "step": 244
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2683,
- "step": 245
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.31,
- "step": 246
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0161,
- "step": 247
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.484,
- "step": 248
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9726,
- "step": 249
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0926,
- "step": 250
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5279,
- "step": 251
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0017,
- "step": 252
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5684,
- "step": 253
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3875,
- "step": 254
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9489,
- "step": 255
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.8948,
- "step": 256
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0856,
- "step": 257
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.599,
- "step": 258
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1575,
- "step": 259
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3701,
- "step": 260
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.464,
- "step": 261
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9193,
- "step": 262
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5679,
- "step": 263
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9424,
- "step": 264
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6689,
- "step": 265
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6475,
- "step": 266
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4311,
- "step": 267
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7426,
- "step": 268
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5191,
- "step": 269
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3059,
- "step": 270
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0142,
- "step": 271
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.4509,
- "step": 272
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0831,
- "step": 273
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6977,
- "step": 274
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4236,
- "step": 275
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2129,
- "step": 276
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1394,
- "step": 277
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.685,
- "step": 278
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0275,
- "step": 279
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.3215,
- "step": 280
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6542,
- "step": 281
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7614,
- "step": 282
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2996,
- "step": 283
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6275,
- "step": 284
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8736,
- "step": 285
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4667,
- "step": 286
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8486,
- "step": 287
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2125,
- "step": 288
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4523,
- "step": 289
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.551,
- "step": 290
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.7158,
- "step": 291
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5092,
- "step": 292
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9169,
- "step": 293
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5333,
- "step": 294
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9949,
- "step": 295
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.7189,
- "step": 296
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2366,
- "step": 297
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4745,
- "step": 298
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2439,
- "step": 299
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4176,
- "step": 300
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.9365,
- "step": 301
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5309,
- "step": 302
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2201,
- "step": 303
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0312,
- "step": 304
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4173,
- "step": 305
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4856,
- "step": 306
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5041,
- "step": 307
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3597,
- "step": 308
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8395,
- "step": 309
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0776,
- "step": 310
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7566,
- "step": 311
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9767,
- "step": 312
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3804,
- "step": 313
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5327,
- "step": 314
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5293,
- "step": 315
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4531,
- "step": 316
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3961,
- "step": 317
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5669,
- "step": 318
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8559,
- "step": 319
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.117,
- "step": 320
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4279,
- "step": 321
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7977,
- "step": 322
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.955,
- "step": 323
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0164,
- "step": 324
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.0495,
- "step": 325
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2768,
- "step": 326
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3162,
- "step": 327
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.88,
- "step": 328
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2157,
- "step": 329
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8427,
- "step": 330
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9729,
- "step": 331
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1779,
- "step": 332
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1302,
- "step": 333
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7705,
- "step": 334
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.523,
- "step": 335
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9375,
- "step": 336
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.1409,
- "step": 337
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.633,
- "step": 338
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6481,
- "step": 339
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.933,
- "step": 340
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9179,
- "step": 341
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9332,
- "step": 342
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6553,
- "step": 343
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7412,
- "step": 344
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.849,
- "step": 345
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.7321,
- "step": 346
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9717,
- "step": 347
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3465,
- "step": 348
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4535,
- "step": 349
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2376,
- "step": 350
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9025,
- "step": 351
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.916,
- "step": 352
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.3785,
- "step": 353
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0576,
- "step": 354
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5081,
- "step": 355
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1303,
- "step": 356
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3854,
- "step": 357
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5553,
- "step": 358
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9627,
- "step": 359
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.402,
- "step": 360
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3484,
- "step": 361
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5428,
- "step": 362
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9128,
- "step": 363
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3934,
- "step": 364
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4812,
- "step": 365
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5395,
- "step": 366
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6304,
- "step": 367
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5626,
- "step": 368
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5693,
- "step": 369
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3458,
- "step": 370
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6254,
- "step": 371
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8706,
- "step": 372
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6076,
- "step": 373
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.2912,
- "step": 374
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3326,
- "step": 375
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3735,
- "step": 376
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4916,
- "step": 377
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5553,
- "step": 378
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6241,
- "step": 379
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6106,
- "step": 380
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.266,
- "step": 381
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7738,
- "step": 382
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4988,
- "step": 383
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2968,
- "step": 384
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8512,
- "step": 385
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0341,
- "step": 386
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.898,
- "step": 387
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.23,
- "step": 388
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9608,
- "step": 389
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.3679,
- "step": 390
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.7074,
- "step": 391
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9903,
- "step": 392
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5845,
- "step": 393
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6493,
- "step": 394
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7962,
- "step": 395
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4865,
- "step": 396
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3418,
- "step": 397
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3942,
- "step": 398
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4715,
- "step": 399
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2073,
- "step": 400
- },
- {
- "epoch": 0.0,
- "eval_loss": 7.106412410736084,
- "eval_runtime": 22.5667,
- "eval_samples_per_second": 2.216,
- "eval_steps_per_second": 1.108,
- "step": 400
- },
- {
- "epoch": 0.0,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 2.9128687667846678,
- "step": 400
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3984,
- "step": 401
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7983,
- "step": 402
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8589,
- "step": 403
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9884,
- "step": 404
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4427,
- "step": 405
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0374,
- "step": 406
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7999,
- "step": 407
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2437,
- "step": 408
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6902,
- "step": 409
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.81,
- "step": 410
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8979,
- "step": 411
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0211,
- "step": 412
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3945,
- "step": 413
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5807,
- "step": 414
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1433,
- "step": 415
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9466,
- "step": 416
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6276,
- "step": 417
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4945,
- "step": 418
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.6215,
- "step": 419
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.3919,
- "step": 420
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7915,
- "step": 421
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3284,
- "step": 422
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8723,
- "step": 423
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0149,
- "step": 424
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.979,
- "step": 425
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9175,
- "step": 426
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4994,
- "step": 427
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9791,
- "step": 428
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1156,
- "step": 429
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5813,
- "step": 430
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1882,
- "step": 431
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9956,
- "step": 432
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6189,
- "step": 433
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9624,
- "step": 434
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5387,
- "step": 435
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4605,
- "step": 436
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.474,
- "step": 437
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0497,
- "step": 438
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5705,
- "step": 439
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.275,
- "step": 440
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9638,
- "step": 441
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4857,
- "step": 442
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3067,
- "step": 443
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8152,
- "step": 444
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1668,
- "step": 445
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5293,
- "step": 446
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3981,
- "step": 447
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4787,
- "step": 448
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5981,
- "step": 449
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3569,
- "step": 450
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4088,
- "step": 451
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3677,
- "step": 452
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.4686,
- "step": 453
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3552,
- "step": 454
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7931,
- "step": 455
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9285,
- "step": 456
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0554,
- "step": 457
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7277,
- "step": 458
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2474,
- "step": 459
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9274,
- "step": 460
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2558,
- "step": 461
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.7547,
- "step": 462
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1264,
- "step": 463
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2124,
- "step": 464
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8751,
- "step": 465
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7317,
- "step": 466
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3697,
- "step": 467
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0021,
- "step": 468
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3761,
- "step": 469
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2291,
- "step": 470
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7968,
- "step": 471
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9454,
- "step": 472
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0194,
- "step": 473
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5048,
- "step": 474
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6837,
- "step": 475
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1066,
- "step": 476
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3501,
- "step": 477
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5071,
- "step": 478
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1086,
- "step": 479
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7269,
- "step": 480
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5419,
- "step": 481
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2974,
- "step": 482
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.1433,
- "step": 483
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0869,
- "step": 484
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.032,
- "step": 485
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0946,
- "step": 486
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7162,
- "step": 487
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0406,
- "step": 488
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.9048,
- "step": 489
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2231,
- "step": 490
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.6524,
- "step": 491
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1151,
- "step": 492
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.591,
- "step": 493
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1628,
- "step": 494
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0757,
- "step": 495
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3471,
- "step": 496
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9385,
- "step": 497
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9362,
- "step": 498
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2252,
- "step": 499
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.359,
- "step": 500
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0497,
- "step": 501
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0484,
- "step": 502
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5773,
- "step": 503
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.39,
- "step": 504
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5923,
- "step": 505
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2,
- "step": 506
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5536,
- "step": 507
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.8958,
- "step": 508
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7763,
- "step": 509
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2045,
- "step": 510
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.4219,
- "step": 511
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6305,
- "step": 512
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4243,
- "step": 513
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7842,
- "step": 514
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8769,
- "step": 515
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8903,
- "step": 516
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0489,
- "step": 517
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1314,
- "step": 518
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5973,
- "step": 519
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8022,
- "step": 520
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3539,
- "step": 521
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.222,
- "step": 522
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5403,
- "step": 523
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1323,
- "step": 524
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7813,
- "step": 525
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4982,
- "step": 526
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2426,
- "step": 527
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0142,
- "step": 528
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8996,
- "step": 529
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8671,
- "step": 530
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4139,
- "step": 531
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9478,
- "step": 532
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7062,
- "step": 533
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.0098,
- "step": 534
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9195,
- "step": 535
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0255,
- "step": 536
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6291,
- "step": 537
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3245,
- "step": 538
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6382,
- "step": 539
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.8076,
- "step": 540
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6725,
- "step": 541
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0563,
- "step": 542
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.6178,
- "step": 543
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7974,
- "step": 544
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.7535,
- "step": 545
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4948,
- "step": 546
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8941,
- "step": 547
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.6496,
- "step": 548
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.9084,
- "step": 549
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.65,
- "step": 550
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7673,
- "step": 551
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2221,
- "step": 552
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.14,
- "step": 553
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.6747,
- "step": 554
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8009,
- "step": 555
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7307,
- "step": 556
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0143,
- "step": 557
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8098,
- "step": 558
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.026,
- "step": 559
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4572,
- "step": 560
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7913,
- "step": 561
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9962,
- "step": 562
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.767,
- "step": 563
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9497,
- "step": 564
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9626,
- "step": 565
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2536,
- "step": 566
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0421,
- "step": 567
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8177,
- "step": 568
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9241,
- "step": 569
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0162,
- "step": 570
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3368,
- "step": 571
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7515,
- "step": 572
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6389,
- "step": 573
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.662,
- "step": 574
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8097,
- "step": 575
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9346,
- "step": 576
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3154,
- "step": 577
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7724,
- "step": 578
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3685,
- "step": 579
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.2775,
- "step": 580
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.106,
- "step": 581
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4733,
- "step": 582
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2334,
- "step": 583
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9478,
- "step": 584
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0013,
- "step": 585
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7242,
- "step": 586
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.922,
- "step": 587
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1418,
- "step": 588
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4472,
- "step": 589
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4785,
- "step": 590
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.783,
- "step": 591
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0706,
- "step": 592
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4136,
- "step": 593
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5969,
- "step": 594
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5157,
- "step": 595
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5658,
- "step": 596
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4647,
- "step": 597
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2028,
- "step": 598
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6913,
- "step": 599
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7239,
- "step": 600
- },
- {
- "epoch": 0.0,
- "eval_loss": 7.012163162231445,
- "eval_runtime": 22.5807,
- "eval_samples_per_second": 2.214,
- "eval_steps_per_second": 1.107,
- "step": 600
- },
- {
- "epoch": 0.0,
- "mmlu_eval_accuracy": 0.3260281385281385,
- "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
- "mmlu_eval_accuracy_anatomy": 0.35714285714285715,
- "mmlu_eval_accuracy_astronomy": 0.25,
- "mmlu_eval_accuracy_business_ethics": 0.3333333333333333,
- "mmlu_loss": 4.24488224029541,
- "step": 600
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5253,
- "step": 601
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0392,
- "step": 602
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.447,
- "step": 603
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9441,
- "step": 604
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1874,
- "step": 605
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7817,
- "step": 606
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0348,
- "step": 607
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5593,
- "step": 608
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9361,
- "step": 609
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3534,
- "step": 610
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.476,
- "step": 611
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0937,
- "step": 612
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3027,
- "step": 613
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5586,
- "step": 614
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3796,
- "step": 615
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.676,
- "step": 616
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.5321,
- "step": 617
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0059,
- "step": 618
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6139,
- "step": 619
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.2391,
- "step": 620
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.0636,
- "step": 621
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0895,
- "step": 622
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.62,
- "step": 623
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0469,
- "step": 624
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2173,
- "step": 625
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9432,
- "step": 626
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3928,
- "step": 627
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0959,
- "step": 628
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.1197,
- "step": 629
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4277,
- "step": 630
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.418,
- "step": 631
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8687,
- "step": 632
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0156,
- "step": 633
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.573,
- "step": 634
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.112,
- "step": 635
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8954,
- "step": 636
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.36,
- "step": 637
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.924,
- "step": 638
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.4625,
- "step": 639
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2023,
- "step": 640
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0685,
- "step": 641
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.5304,
- "step": 642
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4456,
- "step": 643
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7271,
- "step": 644
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6011,
- "step": 645
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.895,
- "step": 646
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.864,
- "step": 647
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3452,
- "step": 648
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8978,
- "step": 649
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2253,
- "step": 650
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2813,
- "step": 651
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7248,
- "step": 652
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4283,
- "step": 653
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4304,
- "step": 654
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3893,
- "step": 655
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1115,
- "step": 656
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5892,
- "step": 657
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6572,
- "step": 658
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.925,
- "step": 659
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4431,
- "step": 660
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7711,
- "step": 661
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9439,
- "step": 662
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3781,
- "step": 663
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5573,
- "step": 664
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.4476,
- "step": 665
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0057,
- "step": 666
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2702,
- "step": 667
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5717,
- "step": 668
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2242,
- "step": 669
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1,
- "step": 670
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0517,
- "step": 671
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6543,
- "step": 672
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1138,
- "step": 673
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.461,
- "step": 674
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.7094,
- "step": 675
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.521,
- "step": 676
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7116,
- "step": 677
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6343,
- "step": 678
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3762,
- "step": 679
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3603,
- "step": 680
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7144,
- "step": 681
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4545,
- "step": 682
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8188,
- "step": 683
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7965,
- "step": 684
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.4675,
- "step": 685
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0436,
- "step": 686
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1219,
- "step": 687
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4517,
- "step": 688
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8476,
- "step": 689
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 1.9284,
- "step": 690
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7405,
- "step": 691
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7142,
- "step": 692
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3979,
- "step": 693
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 1.3285,
- "step": 694
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3418,
- "step": 695
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4472,
- "step": 696
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7355,
- "step": 697
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7982,
- "step": 698
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.4516,
- "step": 699
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.2532,
- "step": 700
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.9959,
- "step": 701
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0418,
- "step": 702
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.7767,
- "step": 703
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.774,
- "step": 704
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8912,
- "step": 705
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2178,
- "step": 706
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.6197,
- "step": 707
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4755,
- "step": 708
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8276,
- "step": 709
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.2925,
- "step": 710
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.3887,
- "step": 711
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1465,
- "step": 712
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.5806,
- "step": 713
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3063,
- "step": 714
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6066,
- "step": 715
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1536,
- "step": 716
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5582,
- "step": 717
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0353,
- "step": 718
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6415,
- "step": 719
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8291,
- "step": 720
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.7575,
- "step": 721
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9141,
- "step": 722
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5217,
- "step": 723
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4549,
- "step": 724
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8112,
- "step": 725
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2729,
- "step": 726
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8515,
- "step": 727
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9712,
- "step": 728
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.097,
- "step": 729
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0208,
- "step": 730
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1377,
- "step": 731
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4019,
- "step": 732
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.9869,
- "step": 733
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2954,
- "step": 734
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4144,
- "step": 735
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8053,
- "step": 736
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8891,
- "step": 737
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.812,
- "step": 738
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2657,
- "step": 739
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3747,
- "step": 740
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0364,
- "step": 741
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8845,
- "step": 742
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.887,
- "step": 743
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0706,
- "step": 744
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6619,
- "step": 745
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2941,
- "step": 746
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9192,
- "step": 747
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9947,
- "step": 748
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6376,
- "step": 749
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0358,
- "step": 750
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4578,
- "step": 751
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7784,
- "step": 752
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.632,
- "step": 753
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.8649,
- "step": 754
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7951,
- "step": 755
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3841,
- "step": 756
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4558,
- "step": 757
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7638,
- "step": 758
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9413,
- "step": 759
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0916,
- "step": 760
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1351,
- "step": 761
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6078,
- "step": 762
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7982,
- "step": 763
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6132,
- "step": 764
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.551,
- "step": 765
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3301,
- "step": 766
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4888,
- "step": 767
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1476,
- "step": 768
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4244,
- "step": 769
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.6025,
- "step": 770
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.102,
- "step": 771
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.017,
- "step": 772
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4101,
- "step": 773
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1741,
- "step": 774
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1256,
- "step": 775
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5164,
- "step": 776
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.6959,
- "step": 777
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7666,
- "step": 778
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4336,
- "step": 779
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 11.8478,
- "step": 780
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8382,
- "step": 781
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1792,
- "step": 782
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4424,
- "step": 783
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.345,
- "step": 784
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6887,
- "step": 785
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.9867,
- "step": 786
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6152,
- "step": 787
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7283,
- "step": 788
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0157,
- "step": 789
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6044,
- "step": 790
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4132,
- "step": 791
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.735,
- "step": 792
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3631,
- "step": 793
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2308,
- "step": 794
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2184,
- "step": 795
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.4661,
- "step": 796
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9637,
- "step": 797
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4178,
- "step": 798
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.5909,
- "step": 799
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.1482,
- "step": 800
- },
- {
- "epoch": 0.01,
- "eval_loss": 7.355834484100342,
- "eval_runtime": 22.6252,
- "eval_samples_per_second": 2.21,
- "eval_steps_per_second": 1.105,
- "step": 800
- },
- {
- "epoch": 0.01,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 5.191131496429444,
- "step": 800
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.0427,
- "step": 801
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.2669,
- "step": 802
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.8026,
- "step": 803
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4949,
- "step": 804
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4491,
- "step": 805
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0383,
- "step": 806
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.1213,
- "step": 807
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5158,
- "step": 808
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5648,
- "step": 809
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9932,
- "step": 810
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6441,
- "step": 811
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8661,
- "step": 812
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3609,
- "step": 813
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6828,
- "step": 814
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9693,
- "step": 815
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3733,
- "step": 816
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6286,
- "step": 817
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4349,
- "step": 818
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6706,
- "step": 819
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3089,
- "step": 820
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2394,
- "step": 821
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.963,
- "step": 822
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6564,
- "step": 823
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.997,
- "step": 824
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 1.9261,
- "step": 825
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1421,
- "step": 826
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2335,
- "step": 827
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3432,
- "step": 828
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0154,
- "step": 829
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5135,
- "step": 830
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6226,
- "step": 831
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1672,
- "step": 832
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0853,
- "step": 833
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1213,
- "step": 834
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7815,
- "step": 835
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8916,
- "step": 836
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6464,
- "step": 837
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3307,
- "step": 838
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8165,
- "step": 839
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.886,
- "step": 840
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4781,
- "step": 841
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.8351,
- "step": 842
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.358,
- "step": 843
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6501,
- "step": 844
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0864,
- "step": 845
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2922,
- "step": 846
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.9847,
- "step": 847
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.2558,
- "step": 848
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0195,
- "step": 849
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.996,
- "step": 850
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.5705,
- "step": 851
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4136,
- "step": 852
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.6302,
- "step": 853
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8761,
- "step": 854
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4995,
- "step": 855
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4762,
- "step": 856
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5749,
- "step": 857
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0273,
- "step": 858
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8258,
- "step": 859
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.1836,
- "step": 860
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5493,
- "step": 861
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1891,
- "step": 862
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.7392,
- "step": 863
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1655,
- "step": 864
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.5218,
- "step": 865
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3759,
- "step": 866
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2497,
- "step": 867
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5901,
- "step": 868
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0624,
- "step": 869
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.2452,
- "step": 870
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5649,
- "step": 871
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0826,
- "step": 872
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.2703,
- "step": 873
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9088,
- "step": 874
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3875,
- "step": 875
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2511,
- "step": 876
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4065,
- "step": 877
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.175,
- "step": 878
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8358,
- "step": 879
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3208,
- "step": 880
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2049,
- "step": 881
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8251,
- "step": 882
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4262,
- "step": 883
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2227,
- "step": 884
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1062,
- "step": 885
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9417,
- "step": 886
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3315,
- "step": 887
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0012,
- "step": 888
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6386,
- "step": 889
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0377,
- "step": 890
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6707,
- "step": 891
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4955,
- "step": 892
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7343,
- "step": 893
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8305,
- "step": 894
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.7016,
- "step": 895
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7149,
- "step": 896
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5649,
- "step": 897
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.815,
- "step": 898
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6135,
- "step": 899
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8776,
- "step": 900
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.7288,
- "step": 901
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8019,
- "step": 902
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0921,
- "step": 903
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.824,
- "step": 904
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7151,
- "step": 905
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.5054,
- "step": 906
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8095,
- "step": 907
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.3218,
- "step": 908
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9993,
- "step": 909
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4433,
- "step": 910
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5863,
- "step": 911
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.505,
- "step": 912
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9734,
- "step": 913
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1792,
- "step": 914
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4574,
- "step": 915
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2787,
- "step": 916
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8201,
- "step": 917
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2537,
- "step": 918
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1387,
- "step": 919
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7161,
- "step": 920
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2207,
- "step": 921
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7953,
- "step": 922
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9949,
- "step": 923
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9173,
- "step": 924
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7903,
- "step": 925
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4784,
- "step": 926
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2264,
- "step": 927
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.566,
- "step": 928
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0686,
- "step": 929
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.791,
- "step": 930
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8393,
- "step": 931
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4387,
- "step": 932
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2374,
- "step": 933
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.9598,
- "step": 934
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1597,
- "step": 935
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0403,
- "step": 936
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3301,
- "step": 937
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.072,
- "step": 938
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4788,
- "step": 939
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0656,
- "step": 940
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9647,
- "step": 941
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1168,
- "step": 942
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0293,
- "step": 943
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3622,
- "step": 944
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8957,
- "step": 945
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.4,
- "step": 946
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.6626,
- "step": 947
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8212,
- "step": 948
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8638,
- "step": 949
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6406,
- "step": 950
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7069,
- "step": 951
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1384,
- "step": 952
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.612,
- "step": 953
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7201,
- "step": 954
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3532,
- "step": 955
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1266,
- "step": 956
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6192,
- "step": 957
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.826,
- "step": 958
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9338,
- "step": 959
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.4487,
- "step": 960
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.872,
- "step": 961
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8601,
- "step": 962
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7401,
- "step": 963
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5412,
- "step": 964
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2501,
- "step": 965
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6837,
- "step": 966
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6494,
- "step": 967
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.604,
- "step": 968
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.837,
- "step": 969
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3957,
- "step": 970
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3281,
- "step": 971
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8264,
- "step": 972
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6554,
- "step": 973
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.5768,
- "step": 974
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4187,
- "step": 975
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8479,
- "step": 976
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9849,
- "step": 977
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6471,
- "step": 978
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8041,
- "step": 979
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8876,
- "step": 980
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6423,
- "step": 981
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5329,
- "step": 982
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2801,
- "step": 983
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1699,
- "step": 984
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6469,
- "step": 985
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6766,
- "step": 986
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7538,
- "step": 987
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9606,
- "step": 988
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0713,
- "step": 989
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4965,
- "step": 990
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3408,
- "step": 991
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4007,
- "step": 992
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.8921,
- "step": 993
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8681,
- "step": 994
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 1.8867,
- "step": 995
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.467,
- "step": 996
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7895,
- "step": 997
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0523,
- "step": 998
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4032,
- "step": 999
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7719,
- "step": 1000
- },
- {
- "epoch": 0.01,
- "eval_loss": 6.766034126281738,
- "eval_runtime": 22.4042,
- "eval_samples_per_second": 2.232,
- "eval_steps_per_second": 1.116,
- "step": 1000
- },
- {
- "epoch": 0.01,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 4.338861379623413,
- "step": 1000
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0285,
- "step": 1001
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4571,
- "step": 1002
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7721,
- "step": 1003
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5108,
- "step": 1004
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3813,
- "step": 1005
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.7963,
- "step": 1006
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1101,
- "step": 1007
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.021,
- "step": 1008
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5916,
- "step": 1009
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8813,
- "step": 1010
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1193,
- "step": 1011
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5092,
- "step": 1012
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8569,
- "step": 1013
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.119,
- "step": 1014
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3247,
- "step": 1015
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.2358,
- "step": 1016
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2795,
- "step": 1017
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3466,
- "step": 1018
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5443,
- "step": 1019
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7296,
- "step": 1020
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0412,
- "step": 1021
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4829,
- "step": 1022
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7901,
- "step": 1023
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8077,
- "step": 1024
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4887,
- "step": 1025
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3095,
- "step": 1026
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3235,
- "step": 1027
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.6315,
- "step": 1028
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4294,
- "step": 1029
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8457,
- "step": 1030
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7583,
- "step": 1031
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3129,
- "step": 1032
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1832,
- "step": 1033
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.1764,
- "step": 1034
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0101,
- "step": 1035
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6524,
- "step": 1036
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2825,
- "step": 1037
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2262,
- "step": 1038
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2533,
- "step": 1039
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8794,
- "step": 1040
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7901,
- "step": 1041
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8351,
- "step": 1042
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5888,
- "step": 1043
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8932,
- "step": 1044
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2999,
- "step": 1045
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8396,
- "step": 1046
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.4209,
- "step": 1047
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.1524,
- "step": 1048
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7784,
- "step": 1049
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0179,
- "step": 1050
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1153,
- "step": 1051
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2149,
- "step": 1052
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0117,
- "step": 1053
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9693,
- "step": 1054
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5656,
- "step": 1055
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5,
- "step": 1056
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.102,
- "step": 1057
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3079,
- "step": 1058
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5754,
- "step": 1059
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6989,
- "step": 1060
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9597,
- "step": 1061
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3743,
- "step": 1062
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8887,
- "step": 1063
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.3779,
- "step": 1064
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5001,
- "step": 1065
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4095,
- "step": 1066
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5073,
- "step": 1067
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1331,
- "step": 1068
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.323,
- "step": 1069
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6116,
- "step": 1070
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.1212,
- "step": 1071
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0951,
- "step": 1072
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.2463,
- "step": 1073
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4488,
- "step": 1074
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.279,
- "step": 1075
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.5728,
- "step": 1076
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1362,
- "step": 1077
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6648,
- "step": 1078
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.427,
- "step": 1079
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8145,
- "step": 1080
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5308,
- "step": 1081
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.974,
- "step": 1082
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1965,
- "step": 1083
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8749,
- "step": 1084
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7352,
- "step": 1085
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.7934,
- "step": 1086
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6003,
- "step": 1087
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5775,
- "step": 1088
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.519,
- "step": 1089
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7403,
- "step": 1090
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8145,
- "step": 1091
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5776,
- "step": 1092
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3753,
- "step": 1093
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.9586,
- "step": 1094
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7263,
- "step": 1095
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.7034,
- "step": 1096
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0579,
- "step": 1097
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.8419,
- "step": 1098
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0751,
- "step": 1099
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6438,
- "step": 1100
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8744,
- "step": 1101
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4992,
- "step": 1102
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.8094,
- "step": 1103
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.162,
- "step": 1104
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.8351,
- "step": 1105
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8845,
- "step": 1106
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1894,
- "step": 1107
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.8333,
- "step": 1108
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4226,
- "step": 1109
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0712,
- "step": 1110
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.9981,
- "step": 1111
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.5885,
- "step": 1112
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.1915,
- "step": 1113
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8003,
- "step": 1114
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5566,
- "step": 1115
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4085,
- "step": 1116
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0793,
- "step": 1117
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0909,
- "step": 1118
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2273,
- "step": 1119
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8273,
- "step": 1120
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0231,
- "step": 1121
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7229,
- "step": 1122
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4479,
- "step": 1123
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2178,
- "step": 1124
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9038,
- "step": 1125
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2653,
- "step": 1126
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2974,
- "step": 1127
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3003,
- "step": 1128
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7853,
- "step": 1129
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.9143,
- "step": 1130
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2573,
- "step": 1131
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7091,
- "step": 1132
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3372,
- "step": 1133
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4165,
- "step": 1134
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4422,
- "step": 1135
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.7693,
- "step": 1136
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7802,
- "step": 1137
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7263,
- "step": 1138
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.6749,
- "step": 1139
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9459,
- "step": 1140
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9697,
- "step": 1141
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4506,
- "step": 1142
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5099,
- "step": 1143
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1475,
- "step": 1144
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3769,
- "step": 1145
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2035,
- "step": 1146
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6017,
- "step": 1147
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.463,
- "step": 1148
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3844,
- "step": 1149
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5306,
- "step": 1150
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5502,
- "step": 1151
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7394,
- "step": 1152
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5626,
- "step": 1153
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1618,
- "step": 1154
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5174,
- "step": 1155
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1038,
- "step": 1156
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3789,
- "step": 1157
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2985,
- "step": 1158
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4763,
- "step": 1159
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.5071,
- "step": 1160
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0827,
- "step": 1161
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7349,
- "step": 1162
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.798,
- "step": 1163
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3176,
- "step": 1164
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8114,
- "step": 1165
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3379,
- "step": 1166
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1157,
- "step": 1167
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.4675,
- "step": 1168
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2721,
- "step": 1169
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0603,
- "step": 1170
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6358,
- "step": 1171
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0865,
- "step": 1172
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.709,
- "step": 1173
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7705,
- "step": 1174
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7677,
- "step": 1175
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2418,
- "step": 1176
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.7114,
- "step": 1177
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1165,
- "step": 1178
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9654,
- "step": 1179
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0672,
- "step": 1180
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1738,
- "step": 1181
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7604,
- "step": 1182
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8426,
- "step": 1183
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0231,
- "step": 1184
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2938,
- "step": 1185
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.783,
- "step": 1186
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3328,
- "step": 1187
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.321,
- "step": 1188
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6368,
- "step": 1189
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.101,
- "step": 1190
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6777,
- "step": 1191
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0831,
- "step": 1192
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5853,
- "step": 1193
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7923,
- "step": 1194
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3734,
- "step": 1195
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4268,
- "step": 1196
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6796,
- "step": 1197
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.9028,
- "step": 1198
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3716,
- "step": 1199
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6761,
- "step": 1200
- },
- {
- "epoch": 0.01,
- "eval_loss": 6.9188361167907715,
- "eval_runtime": 22.426,
- "eval_samples_per_second": 2.23,
- "eval_steps_per_second": 1.115,
- "step": 1200
- },
- {
- "epoch": 0.01,
- "mmlu_eval_accuracy": 0.3260281385281385,
- "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
- "mmlu_eval_accuracy_anatomy": 0.35714285714285715,
- "mmlu_eval_accuracy_astronomy": 0.25,
- "mmlu_eval_accuracy_business_ethics": 0.3333333333333333,
- "mmlu_loss": 3.3686839294433595,
- "step": 1200
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8855,
- "step": 1201
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.8206,
- "step": 1202
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4401,
- "step": 1203
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2366,
- "step": 1204
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9885,
- "step": 1205
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5444,
- "step": 1206
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4259,
- "step": 1207
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.5369,
- "step": 1208
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0839,
- "step": 1209
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7622,
- "step": 1210
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.8979,
- "step": 1211
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5508,
- "step": 1212
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6439,
- "step": 1213
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6249,
- "step": 1214
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.495,
- "step": 1215
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0642,
- "step": 1216
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8997,
- "step": 1217
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6275,
- "step": 1218
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.3317,
- "step": 1219
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4635,
- "step": 1220
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5197,
- "step": 1221
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5928,
- "step": 1222
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.2363,
- "step": 1223
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0266,
- "step": 1224
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3356,
- "step": 1225
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.7927,
- "step": 1226
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6952,
- "step": 1227
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8878,
- "step": 1228
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7472,
- "step": 1229
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6454,
- "step": 1230
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4972,
- "step": 1231
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.3347,
- "step": 1232
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1631,
- "step": 1233
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.4708,
- "step": 1234
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.5697,
- "step": 1235
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8218,
- "step": 1236
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.269,
- "step": 1237
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4165,
- "step": 1238
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3653,
- "step": 1239
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0152,
- "step": 1240
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.9157,
- "step": 1241
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4086,
- "step": 1242
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2493,
- "step": 1243
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8279,
- "step": 1244
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.6649,
- "step": 1245
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4405,
- "step": 1246
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.1992,
- "step": 1247
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2055,
- "step": 1248
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4395,
- "step": 1249
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2475,
- "step": 1250
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8205,
- "step": 1251
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1053,
- "step": 1252
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.7494,
- "step": 1253
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7387,
- "step": 1254
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8983,
- "step": 1255
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5614,
- "step": 1256
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.7617,
- "step": 1257
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.2445,
- "step": 1258
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3043,
- "step": 1259
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4214,
- "step": 1260
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1384,
- "step": 1261
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3914,
- "step": 1262
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3287,
- "step": 1263
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2174,
- "step": 1264
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4397,
- "step": 1265
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6875,
- "step": 1266
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4512,
- "step": 1267
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2834,
- "step": 1268
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7651,
- "step": 1269
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9263,
- "step": 1270
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6721,
- "step": 1271
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.9178,
- "step": 1272
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7967,
- "step": 1273
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5242,
- "step": 1274
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7794,
- "step": 1275
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4256,
- "step": 1276
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5788,
- "step": 1277
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7586,
- "step": 1278
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.964,
- "step": 1279
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0749,
- "step": 1280
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6248,
- "step": 1281
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2465,
- "step": 1282
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.1591,
- "step": 1283
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4328,
- "step": 1284
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.534,
- "step": 1285
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.523,
- "step": 1286
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.5672,
- "step": 1287
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9162,
- "step": 1288
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.1089,
- "step": 1289
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.3287,
- "step": 1290
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2499,
- "step": 1291
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9645,
- "step": 1292
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3903,
- "step": 1293
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.5322,
- "step": 1294
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2211,
- "step": 1295
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2788,
- "step": 1296
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1862,
- "step": 1297
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2678,
- "step": 1298
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5399,
- "step": 1299
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7935,
- "step": 1300
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0391,
- "step": 1301
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1049,
- "step": 1302
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.365,
- "step": 1303
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.8809,
- "step": 1304
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2335,
- "step": 1305
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.5135,
- "step": 1306
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2378,
- "step": 1307
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9265,
- "step": 1308
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.641,
- "step": 1309
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9822,
- "step": 1310
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3369,
- "step": 1311
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3735,
- "step": 1312
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.2618,
- "step": 1313
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6854,
- "step": 1314
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3748,
- "step": 1315
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9206,
- "step": 1316
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1969,
- "step": 1317
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1245,
- "step": 1318
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.9977,
- "step": 1319
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5319,
- "step": 1320
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4431,
- "step": 1321
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7264,
- "step": 1322
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.05,
- "step": 1323
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3118,
- "step": 1324
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4575,
- "step": 1325
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.593,
- "step": 1326
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0061,
- "step": 1327
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2481,
- "step": 1328
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8017,
- "step": 1329
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8617,
- "step": 1330
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7036,
- "step": 1331
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0091,
- "step": 1332
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.9687,
- "step": 1333
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3925,
- "step": 1334
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1127,
- "step": 1335
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8163,
- "step": 1336
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0639,
- "step": 1337
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8929,
- "step": 1338
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5011,
- "step": 1339
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.033,
- "step": 1340
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0526,
- "step": 1341
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4428,
- "step": 1342
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3477,
- "step": 1343
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.881,
- "step": 1344
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.5276,
- "step": 1345
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4183,
- "step": 1346
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4943,
- "step": 1347
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9187,
- "step": 1348
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1003,
- "step": 1349
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1187,
- "step": 1350
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8081,
- "step": 1351
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4695,
- "step": 1352
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.5761,
- "step": 1353
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9635,
- "step": 1354
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2133,
- "step": 1355
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2611,
- "step": 1356
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.6885,
- "step": 1357
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1157,
- "step": 1358
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4421,
- "step": 1359
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2128,
- "step": 1360
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6978,
- "step": 1361
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9804,
- "step": 1362
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.3426,
- "step": 1363
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2676,
- "step": 1364
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.325,
- "step": 1365
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1263,
- "step": 1366
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.7481,
- "step": 1367
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6891,
- "step": 1368
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8568,
- "step": 1369
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9893,
- "step": 1370
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0302,
- "step": 1371
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3204,
- "step": 1372
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9008,
- "step": 1373
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2624,
- "step": 1374
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6234,
- "step": 1375
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2286,
- "step": 1376
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3426,
- "step": 1377
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1962,
- "step": 1378
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3142,
- "step": 1379
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.414,
- "step": 1380
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0191,
- "step": 1381
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.4953,
- "step": 1382
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.6694,
- "step": 1383
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8611,
- "step": 1384
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.86,
- "step": 1385
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6519,
- "step": 1386
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.394,
- "step": 1387
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2117,
- "step": 1388
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.9924,
- "step": 1389
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.58,
- "step": 1390
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.4415,
- "step": 1391
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.7196,
- "step": 1392
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.7388,
- "step": 1393
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4784,
- "step": 1394
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.496,
- "step": 1395
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.8706,
- "step": 1396
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1858,
- "step": 1397
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.9038,
- "step": 1398
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4852,
- "step": 1399
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2717,
- "step": 1400
- },
- {
- "epoch": 0.01,
- "eval_loss": 6.97923469543457,
- "eval_runtime": 22.472,
- "eval_samples_per_second": 2.225,
- "eval_steps_per_second": 1.112,
- "step": 1400
- },
- {
- "epoch": 0.01,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 3.657382688522339,
- "step": 1400
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.843,
- "step": 1401
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.5611,
- "step": 1402
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2434,
- "step": 1403
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3136,
- "step": 1404
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.686,
- "step": 1405
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6365,
- "step": 1406
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.1811,
- "step": 1407
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7537,
- "step": 1408
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.2949,
- "step": 1409
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4827,
- "step": 1410
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0965,
- "step": 1411
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.007,
- "step": 1412
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2861,
- "step": 1413
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1774,
- "step": 1414
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7777,
- "step": 1415
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0259,
- "step": 1416
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.9024,
- "step": 1417
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4786,
- "step": 1418
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5873,
- "step": 1419
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2744,
- "step": 1420
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.9484,
- "step": 1421
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2093,
- "step": 1422
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3394,
- "step": 1423
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1247,
- "step": 1424
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0691,
- "step": 1425
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.559,
- "step": 1426
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1518,
- "step": 1427
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4143,
- "step": 1428
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0287,
- "step": 1429
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8112,
- "step": 1430
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2625,
- "step": 1431
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3528,
- "step": 1432
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2715,
- "step": 1433
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7849,
- "step": 1434
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2002,
- "step": 1435
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0658,
- "step": 1436
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0671,
- "step": 1437
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2577,
- "step": 1438
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.803,
- "step": 1439
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2974,
- "step": 1440
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0897,
- "step": 1441
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0805,
- "step": 1442
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7681,
- "step": 1443
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.6565,
- "step": 1444
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0174,
- "step": 1445
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.8507,
- "step": 1446
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2105,
- "step": 1447
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.572,
- "step": 1448
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2904,
- "step": 1449
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4623,
- "step": 1450
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4774,
- "step": 1451
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1277,
- "step": 1452
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.6204,
- "step": 1453
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.3219,
- "step": 1454
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2517,
- "step": 1455
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.3026,
- "step": 1456
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4016,
- "step": 1457
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5256,
- "step": 1458
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9316,
- "step": 1459
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.631,
- "step": 1460
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2888,
- "step": 1461
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5511,
- "step": 1462
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.9799,
- "step": 1463
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6982,
- "step": 1464
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4923,
- "step": 1465
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8329,
- "step": 1466
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2733,
- "step": 1467
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8221,
- "step": 1468
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.363,
- "step": 1469
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6348,
- "step": 1470
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3319,
- "step": 1471
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6768,
- "step": 1472
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.1985,
- "step": 1473
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.6109,
- "step": 1474
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.974,
- "step": 1475
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8902,
- "step": 1476
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6762,
- "step": 1477
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8541,
- "step": 1478
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3867,
- "step": 1479
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.9624,
- "step": 1480
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8768,
- "step": 1481
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7598,
- "step": 1482
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6522,
- "step": 1483
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8156,
- "step": 1484
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.3791,
- "step": 1485
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2178,
- "step": 1486
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8448,
- "step": 1487
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5377,
- "step": 1488
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7407,
- "step": 1489
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7636,
- "step": 1490
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4325,
- "step": 1491
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8966,
- "step": 1492
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0626,
- "step": 1493
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.255,
- "step": 1494
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2802,
- "step": 1495
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.894,
- "step": 1496
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6482,
- "step": 1497
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.8903,
- "step": 1498
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8672,
- "step": 1499
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6079,
- "step": 1500
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6217,
- "step": 1501
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2361,
- "step": 1502
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3962,
- "step": 1503
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0036,
- "step": 1504
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5926,
- "step": 1505
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.114,
- "step": 1506
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4419,
- "step": 1507
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7838,
- "step": 1508
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6635,
- "step": 1509
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.2906,
- "step": 1510
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4186,
- "step": 1511
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4783,
- "step": 1512
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1226,
- "step": 1513
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2458,
- "step": 1514
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5302,
- "step": 1515
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1515,
- "step": 1516
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4182,
- "step": 1517
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8248,
- "step": 1518
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2349,
- "step": 1519
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9314,
- "step": 1520
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1161,
- "step": 1521
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4183,
- "step": 1522
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4454,
- "step": 1523
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5588,
- "step": 1524
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.8026,
- "step": 1525
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.7695,
- "step": 1526
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3636,
- "step": 1527
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2776,
- "step": 1528
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5386,
- "step": 1529
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.521,
- "step": 1530
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.8388,
- "step": 1531
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3561,
- "step": 1532
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9606,
- "step": 1533
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9438,
- "step": 1534
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7665,
- "step": 1535
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5826,
- "step": 1536
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.0798,
- "step": 1537
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.8545,
- "step": 1538
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.302,
- "step": 1539
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1092,
- "step": 1540
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.5021,
- "step": 1541
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9384,
- "step": 1542
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8761,
- "step": 1543
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3316,
- "step": 1544
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.2051,
- "step": 1545
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7907,
- "step": 1546
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.2534,
- "step": 1547
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2274,
- "step": 1548
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9226,
- "step": 1549
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2502,
- "step": 1550
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2703,
- "step": 1551
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4359,
- "step": 1552
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.128,
- "step": 1553
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3147,
- "step": 1554
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.026,
- "step": 1555
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9393,
- "step": 1556
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7753,
- "step": 1557
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.9049,
- "step": 1558
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0538,
- "step": 1559
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.8691,
- "step": 1560
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9377,
- "step": 1561
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8385,
- "step": 1562
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.939,
- "step": 1563
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.727,
- "step": 1564
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.7866,
- "step": 1565
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2439,
- "step": 1566
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9607,
- "step": 1567
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3505,
- "step": 1568
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7729,
- "step": 1569
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4669,
- "step": 1570
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.8178,
- "step": 1571
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2173,
- "step": 1572
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2136,
- "step": 1573
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.2888,
- "step": 1574
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0386,
- "step": 1575
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9041,
- "step": 1576
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7544,
- "step": 1577
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.3229,
- "step": 1578
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4203,
- "step": 1579
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.497,
- "step": 1580
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8253,
- "step": 1581
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.0801,
- "step": 1582
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1585,
- "step": 1583
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6965,
- "step": 1584
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.498,
- "step": 1585
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.8697,
- "step": 1586
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2663,
- "step": 1587
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7004,
- "step": 1588
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.6561,
- "step": 1589
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.785,
- "step": 1590
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5708,
- "step": 1591
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.326,
- "step": 1592
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.2974,
- "step": 1593
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1408,
- "step": 1594
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6526,
- "step": 1595
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4116,
- "step": 1596
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0484,
- "step": 1597
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3162,
- "step": 1598
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.3806,
- "step": 1599
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0251,
- "step": 1600
- },
- {
- "epoch": 0.01,
- "eval_loss": 6.617897987365723,
- "eval_runtime": 22.4646,
- "eval_samples_per_second": 2.226,
- "eval_steps_per_second": 1.113,
- "step": 1600
- },
- {
- "epoch": 0.01,
- "mmlu_eval_accuracy": 0.3260281385281385,
- "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
- "mmlu_eval_accuracy_anatomy": 0.35714285714285715,
- "mmlu_eval_accuracy_astronomy": 0.25,
- "mmlu_eval_accuracy_business_ethics": 0.3333333333333333,
- "mmlu_loss": 4.160770101547241,
- "step": 1600
- }
- ],
- "max_steps": 30000,
- "num_train_epochs": 1,
- "total_flos": 2.715294108224717e+16,
- "trial_name": null,
- "trial_params": null
-}
diff --git a/checkpoint-1600/training_args.bin b/checkpoint-1600/training_args.bin
deleted file mode 100644
index 29a1b90871dc30211978426049e89f31e2b38f56..0000000000000000000000000000000000000000
--- a/checkpoint-1600/training_args.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2493c95326c359fb00f88976162bc7966690beaaca22964b91c1db649a04988f
-size 6011
diff --git a/checkpoint-1800/README.md b/checkpoint-1800/README.md
deleted file mode 100644
index 82793f73e61dbb024e11fc6697bba1622d4d0db6..0000000000000000000000000000000000000000
--- a/checkpoint-1800/README.md
+++ /dev/null
@@ -1,20 +0,0 @@
----
-library_name: peft
----
-## Training procedure
-
-
-The following `bitsandbytes` quantization config was used during training:
-- load_in_8bit: False
-- load_in_4bit: True
-- llm_int8_threshold: 6.0
-- llm_int8_skip_modules: None
-- llm_int8_enable_fp32_cpu_offload: False
-- llm_int8_has_fp16_weight: False
-- bnb_4bit_quant_type: nf4
-- bnb_4bit_use_double_quant: True
-- bnb_4bit_compute_dtype: bfloat16
-### Framework versions
-
-
-- PEFT 0.4.0
diff --git a/checkpoint-1800/adapter_config.json b/checkpoint-1800/adapter_config.json
deleted file mode 100644
index a2f0ea437da66b2120cc72d92fb46f999dfb8535..0000000000000000000000000000000000000000
--- a/checkpoint-1800/adapter_config.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
- "auto_mapping": null,
- "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf",
- "bias": "none",
- "fan_in_fan_out": false,
- "inference_mode": true,
- "init_lora_weights": true,
- "layers_pattern": null,
- "layers_to_transform": null,
- "lora_alpha": 16.0,
- "lora_dropout": 0.1,
- "modules_to_save": null,
- "peft_type": "LORA",
- "r": 64,
- "revision": null,
- "target_modules": [
- "down_proj",
- "up_proj",
- "q_proj",
- "gate_proj",
- "o_proj",
- "v_proj",
- "k_proj"
- ],
- "task_type": "CAUSAL_LM"
-}
\ No newline at end of file
diff --git a/checkpoint-1800/adapter_model.bin b/checkpoint-1800/adapter_model.bin
deleted file mode 100644
index 408a253b750c2b59e2e96bb21b2ce602d49633b6..0000000000000000000000000000000000000000
--- a/checkpoint-1800/adapter_model.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3c61e805254e0a600e748d930ef8c72bc26fcde3b479c3cfa35931c18f619f86
-size 871609293
diff --git a/checkpoint-1800/added_tokens.json b/checkpoint-1800/added_tokens.json
deleted file mode 100644
index e41416ddd79948246ea2dced6800ea3cd531c424..0000000000000000000000000000000000000000
--- a/checkpoint-1800/added_tokens.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
- "[PAD]": 32000
-}
diff --git a/checkpoint-1800/optimizer.pt b/checkpoint-1800/optimizer.pt
deleted file mode 100644
index 3fb313f29c7314d19b6c27a840d1dd7c1aa70c9a..0000000000000000000000000000000000000000
--- a/checkpoint-1800/optimizer.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:19d9bbef27f39dbe487933f28d7202a46a46790a115df33e78e6bd1954a418f9
-size 873873439
diff --git a/checkpoint-1800/rng_state.pth b/checkpoint-1800/rng_state.pth
deleted file mode 100644
index d8ca8535035fc59f325b4328b9bb71d7c508265c..0000000000000000000000000000000000000000
--- a/checkpoint-1800/rng_state.pth
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a3a9d5f030044d7c1c38e9af8d159736e36882df4a51d621092d38392dff14ee
-size 14511
diff --git a/checkpoint-1800/scheduler.pt b/checkpoint-1800/scheduler.pt
deleted file mode 100644
index 6b59b53fb55fec134adf9a1e87da4c3bc5d074fe..0000000000000000000000000000000000000000
--- a/checkpoint-1800/scheduler.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4b5171ec9320c39e7e6817f1736e6a4c3dc5a7837db85e0bdb59924fe3c72ecf
-size 627
diff --git a/checkpoint-1800/special_tokens_map.json b/checkpoint-1800/special_tokens_map.json
deleted file mode 100644
index 3f58a5e115855c6ea3cec98accae196ad927222e..0000000000000000000000000000000000000000
--- a/checkpoint-1800/special_tokens_map.json
+++ /dev/null
@@ -1,6 +0,0 @@
-{
- "bos_token": "",
- "eos_token": "",
- "pad_token": "[PAD]",
- "unk_token": ""
-}
diff --git a/checkpoint-1800/tokenizer.model b/checkpoint-1800/tokenizer.model
deleted file mode 100644
index 6c00c742ce03c627d6cd5b795984876fa49fa899..0000000000000000000000000000000000000000
--- a/checkpoint-1800/tokenizer.model
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
-size 499723
diff --git a/checkpoint-1800/tokenizer_config.json b/checkpoint-1800/tokenizer_config.json
deleted file mode 100644
index daaef2433dab9469de98b5b9a3848221ab25b7e8..0000000000000000000000000000000000000000
--- a/checkpoint-1800/tokenizer_config.json
+++ /dev/null
@@ -1,35 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "bos_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- },
- "clean_up_tokenization_spaces": false,
- "eos_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- },
- "legacy": null,
- "model_max_length": 1000000000000000019884624838656,
- "pad_token": null,
- "padding_side": "right",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizer",
- "unk_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/checkpoint-1800/training_args.bin b/checkpoint-1800/training_args.bin
deleted file mode 100644
index 29a1b90871dc30211978426049e89f31e2b38f56..0000000000000000000000000000000000000000
--- a/checkpoint-1800/training_args.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2493c95326c359fb00f88976162bc7966690beaaca22964b91c1db649a04988f
-size 6011
diff --git a/checkpoint-200/README.md b/checkpoint-200/README.md
deleted file mode 100644
index 82793f73e61dbb024e11fc6697bba1622d4d0db6..0000000000000000000000000000000000000000
--- a/checkpoint-200/README.md
+++ /dev/null
@@ -1,20 +0,0 @@
----
-library_name: peft
----
-## Training procedure
-
-
-The following `bitsandbytes` quantization config was used during training:
-- load_in_8bit: False
-- load_in_4bit: True
-- llm_int8_threshold: 6.0
-- llm_int8_skip_modules: None
-- llm_int8_enable_fp32_cpu_offload: False
-- llm_int8_has_fp16_weight: False
-- bnb_4bit_quant_type: nf4
-- bnb_4bit_use_double_quant: True
-- bnb_4bit_compute_dtype: bfloat16
-### Framework versions
-
-
-- PEFT 0.4.0
diff --git a/checkpoint-200/adapter_config.json b/checkpoint-200/adapter_config.json
deleted file mode 100644
index a2f0ea437da66b2120cc72d92fb46f999dfb8535..0000000000000000000000000000000000000000
--- a/checkpoint-200/adapter_config.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
- "auto_mapping": null,
- "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf",
- "bias": "none",
- "fan_in_fan_out": false,
- "inference_mode": true,
- "init_lora_weights": true,
- "layers_pattern": null,
- "layers_to_transform": null,
- "lora_alpha": 16.0,
- "lora_dropout": 0.1,
- "modules_to_save": null,
- "peft_type": "LORA",
- "r": 64,
- "revision": null,
- "target_modules": [
- "down_proj",
- "up_proj",
- "q_proj",
- "gate_proj",
- "o_proj",
- "v_proj",
- "k_proj"
- ],
- "task_type": "CAUSAL_LM"
-}
\ No newline at end of file
diff --git a/checkpoint-200/adapter_model.bin b/checkpoint-200/adapter_model.bin
deleted file mode 100644
index 46d6f050f0c1f5d338463dafa05a5f3475cc65e3..0000000000000000000000000000000000000000
--- a/checkpoint-200/adapter_model.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e24cde49bbdbdfa926b63cf5ddeacb885f909f0185ff48bd760e1a2e8925df52
-size 871609293
diff --git a/checkpoint-200/adapter_model/adapter_model/README.md b/checkpoint-200/adapter_model/adapter_model/README.md
deleted file mode 100644
index 82793f73e61dbb024e11fc6697bba1622d4d0db6..0000000000000000000000000000000000000000
--- a/checkpoint-200/adapter_model/adapter_model/README.md
+++ /dev/null
@@ -1,20 +0,0 @@
----
-library_name: peft
----
-## Training procedure
-
-
-The following `bitsandbytes` quantization config was used during training:
-- load_in_8bit: False
-- load_in_4bit: True
-- llm_int8_threshold: 6.0
-- llm_int8_skip_modules: None
-- llm_int8_enable_fp32_cpu_offload: False
-- llm_int8_has_fp16_weight: False
-- bnb_4bit_quant_type: nf4
-- bnb_4bit_use_double_quant: True
-- bnb_4bit_compute_dtype: bfloat16
-### Framework versions
-
-
-- PEFT 0.4.0
diff --git a/checkpoint-200/adapter_model/adapter_model/adapter_config.json b/checkpoint-200/adapter_model/adapter_model/adapter_config.json
deleted file mode 100644
index a2f0ea437da66b2120cc72d92fb46f999dfb8535..0000000000000000000000000000000000000000
--- a/checkpoint-200/adapter_model/adapter_model/adapter_config.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
- "auto_mapping": null,
- "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf",
- "bias": "none",
- "fan_in_fan_out": false,
- "inference_mode": true,
- "init_lora_weights": true,
- "layers_pattern": null,
- "layers_to_transform": null,
- "lora_alpha": 16.0,
- "lora_dropout": 0.1,
- "modules_to_save": null,
- "peft_type": "LORA",
- "r": 64,
- "revision": null,
- "target_modules": [
- "down_proj",
- "up_proj",
- "q_proj",
- "gate_proj",
- "o_proj",
- "v_proj",
- "k_proj"
- ],
- "task_type": "CAUSAL_LM"
-}
\ No newline at end of file
diff --git a/checkpoint-200/adapter_model/adapter_model/adapter_model.bin b/checkpoint-200/adapter_model/adapter_model/adapter_model.bin
deleted file mode 100644
index 46d6f050f0c1f5d338463dafa05a5f3475cc65e3..0000000000000000000000000000000000000000
--- a/checkpoint-200/adapter_model/adapter_model/adapter_model.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e24cde49bbdbdfa926b63cf5ddeacb885f909f0185ff48bd760e1a2e8925df52
-size 871609293
diff --git a/checkpoint-200/added_tokens.json b/checkpoint-200/added_tokens.json
deleted file mode 100644
index e41416ddd79948246ea2dced6800ea3cd531c424..0000000000000000000000000000000000000000
--- a/checkpoint-200/added_tokens.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
- "[PAD]": 32000
-}
diff --git a/checkpoint-200/optimizer.pt b/checkpoint-200/optimizer.pt
deleted file mode 100644
index 4e7cefc561cc084f74e3da34d9a8b0c00bd900a3..0000000000000000000000000000000000000000
--- a/checkpoint-200/optimizer.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:18b6ed2bfd425ca4415dfeeaa7f036c1e535870d30a4b8dc72be98acf380dab9
-size 873872799
diff --git a/checkpoint-200/rng_state.pth b/checkpoint-200/rng_state.pth
deleted file mode 100644
index 17a28c1038a0a3c91b157fc0e9f9bbc5553386fb..0000000000000000000000000000000000000000
--- a/checkpoint-200/rng_state.pth
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:cc32dcae9e98100dc265bc4ad63fea6daed5748c1556b90bee271aac09744907
-size 14511
diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt
deleted file mode 100644
index 8c7ab25b6fd9c213809864a48f899211804e88f7..0000000000000000000000000000000000000000
--- a/checkpoint-200/scheduler.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:81248501833af563175f43c1d681185643b8411cee1fb1e631b8687c465eb2e3
-size 627
diff --git a/checkpoint-200/special_tokens_map.json b/checkpoint-200/special_tokens_map.json
deleted file mode 100644
index 3f58a5e115855c6ea3cec98accae196ad927222e..0000000000000000000000000000000000000000
--- a/checkpoint-200/special_tokens_map.json
+++ /dev/null
@@ -1,6 +0,0 @@
-{
- "bos_token": "",
- "eos_token": "",
- "pad_token": "[PAD]",
- "unk_token": ""
-}
diff --git a/checkpoint-200/tokenizer.model b/checkpoint-200/tokenizer.model
deleted file mode 100644
index 6c00c742ce03c627d6cd5b795984876fa49fa899..0000000000000000000000000000000000000000
--- a/checkpoint-200/tokenizer.model
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
-size 499723
diff --git a/checkpoint-200/tokenizer_config.json b/checkpoint-200/tokenizer_config.json
deleted file mode 100644
index daaef2433dab9469de98b5b9a3848221ab25b7e8..0000000000000000000000000000000000000000
--- a/checkpoint-200/tokenizer_config.json
+++ /dev/null
@@ -1,35 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "bos_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- },
- "clean_up_tokenization_spaces": false,
- "eos_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- },
- "legacy": null,
- "model_max_length": 1000000000000000019884624838656,
- "pad_token": null,
- "padding_side": "right",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizer",
- "unk_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json
deleted file mode 100644
index bc3a0a0575c658f22c17c549f56b9a921c89d07c..0000000000000000000000000000000000000000
--- a/checkpoint-200/trainer_state.json
+++ /dev/null
@@ -1,1234 +0,0 @@
-{
- "best_metric": 7.883856773376465,
- "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-200",
- "epoch": 0.0015277671682835535,
- "global_step": 200,
- "is_hyper_param_search": false,
- "is_local_process_zero": true,
- "is_world_process_zero": true,
- "log_history": [
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0808,
- "step": 1
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8773,
- "step": 2
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1965,
- "step": 3
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.118,
- "step": 4
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1773,
- "step": 5
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1165,
- "step": 6
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2666,
- "step": 7
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3704,
- "step": 8
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9976,
- "step": 9
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.985,
- "step": 10
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.0541,
- "step": 11
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.6228,
- "step": 12
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.3651,
- "step": 13
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0867,
- "step": 14
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.4422,
- "step": 15
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.7759,
- "step": 16
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1446,
- "step": 17
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0007,
- "step": 18
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0894,
- "step": 19
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2424,
- "step": 20
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.1343,
- "step": 21
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5354,
- "step": 22
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1887,
- "step": 23
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.6652,
- "step": 24
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.964,
- "step": 25
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1872,
- "step": 26
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.4722,
- "step": 27
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1462,
- "step": 28
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0485,
- "step": 29
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.148,
- "step": 30
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7274,
- "step": 31
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.6689,
- "step": 32
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3384,
- "step": 33
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.5354,
- "step": 34
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.1976,
- "step": 35
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.8593,
- "step": 36
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.9302,
- "step": 37
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5968,
- "step": 38
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3169,
- "step": 39
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.1793,
- "step": 40
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.8457,
- "step": 41
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5177,
- "step": 42
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.003,
- "step": 43
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.9928,
- "step": 44
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.2574,
- "step": 45
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3915,
- "step": 46
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4105,
- "step": 47
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.1184,
- "step": 48
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.72,
- "step": 49
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9628,
- "step": 50
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2372,
- "step": 51
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3733,
- "step": 52
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8936,
- "step": 53
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5353,
- "step": 54
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.0754,
- "step": 55
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6685,
- "step": 56
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8984,
- "step": 57
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2265,
- "step": 58
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7696,
- "step": 59
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7349,
- "step": 60
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0221,
- "step": 61
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.1901,
- "step": 62
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.387,
- "step": 63
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7323,
- "step": 64
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2077,
- "step": 65
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.3155,
- "step": 66
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1656,
- "step": 67
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 13.0828,
- "step": 68
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5295,
- "step": 69
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4575,
- "step": 70
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.7654,
- "step": 71
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.6263,
- "step": 72
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 24.8238,
- "step": 73
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.0654,
- "step": 74
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 28.1046,
- "step": 75
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.3232,
- "step": 76
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 22.9712,
- "step": 77
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 18.8529,
- "step": 78
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.8356,
- "step": 79
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 16.472,
- "step": 80
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.2369,
- "step": 81
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.0731,
- "step": 82
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.8853,
- "step": 83
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5438,
- "step": 84
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2665,
- "step": 85
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5484,
- "step": 86
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7546,
- "step": 87
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4309,
- "step": 88
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5593,
- "step": 89
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3822,
- "step": 90
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.6315,
- "step": 91
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6116,
- "step": 92
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2288,
- "step": 93
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0053,
- "step": 94
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.359,
- "step": 95
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9235,
- "step": 96
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 31.9845,
- "step": 97
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.1385,
- "step": 98
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6161,
- "step": 99
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8096,
- "step": 100
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9918,
- "step": 101
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.344,
- "step": 102
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1607,
- "step": 103
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4834,
- "step": 104
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.704,
- "step": 105
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1238,
- "step": 106
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8066,
- "step": 107
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9656,
- "step": 108
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1979,
- "step": 109
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2294,
- "step": 110
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.066,
- "step": 111
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7914,
- "step": 112
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7344,
- "step": 113
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6703,
- "step": 114
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8817,
- "step": 115
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.7733,
- "step": 116
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.469,
- "step": 117
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1304,
- "step": 118
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.871,
- "step": 119
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5353,
- "step": 120
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9055,
- "step": 121
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6142,
- "step": 122
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0201,
- "step": 123
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3805,
- "step": 124
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6825,
- "step": 125
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7166,
- "step": 126
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.7747,
- "step": 127
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7695,
- "step": 128
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7291,
- "step": 129
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1296,
- "step": 130
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5374,
- "step": 131
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.1854,
- "step": 132
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.434,
- "step": 133
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.438,
- "step": 134
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3027,
- "step": 135
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.382,
- "step": 136
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9277,
- "step": 137
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.223,
- "step": 138
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3042,
- "step": 139
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6361,
- "step": 140
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3547,
- "step": 141
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.7181,
- "step": 142
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.7528,
- "step": 143
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.4316,
- "step": 144
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2219,
- "step": 145
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7788,
- "step": 146
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2749,
- "step": 147
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2397,
- "step": 148
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6243,
- "step": 149
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.145,
- "step": 150
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7951,
- "step": 151
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1862,
- "step": 152
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1305,
- "step": 153
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5766,
- "step": 154
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9232,
- "step": 155
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9936,
- "step": 156
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.9692,
- "step": 157
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2772,
- "step": 158
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.302,
- "step": 159
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9931,
- "step": 160
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9675,
- "step": 161
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8536,
- "step": 162
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6589,
- "step": 163
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.932,
- "step": 164
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0301,
- "step": 165
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4861,
- "step": 166
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1354,
- "step": 167
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0717,
- "step": 168
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9346,
- "step": 169
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9373,
- "step": 170
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8777,
- "step": 171
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4193,
- "step": 172
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6831,
- "step": 173
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4175,
- "step": 174
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3629,
- "step": 175
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.118,
- "step": 176
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.633,
- "step": 177
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8355,
- "step": 178
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4522,
- "step": 179
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9272,
- "step": 180
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4631,
- "step": 181
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2987,
- "step": 182
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1183,
- "step": 183
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.9976,
- "step": 184
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0668,
- "step": 185
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6291,
- "step": 186
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5937,
- "step": 187
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7382,
- "step": 188
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7677,
- "step": 189
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.0293,
- "step": 190
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.6407,
- "step": 191
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9508,
- "step": 192
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.5053,
- "step": 193
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.5718,
- "step": 194
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5211,
- "step": 195
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9557,
- "step": 196
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1609,
- "step": 197
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8505,
- "step": 198
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8278,
- "step": 199
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8447,
- "step": 200
- },
- {
- "epoch": 0.0,
- "eval_loss": 7.883856773376465,
- "eval_runtime": 22.4254,
- "eval_samples_per_second": 2.23,
- "eval_steps_per_second": 1.115,
- "step": 200
- },
- {
- "epoch": 0.0,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 4.629522514343262,
- "step": 200
- }
- ],
- "max_steps": 30000,
- "num_train_epochs": 1,
- "total_flos": 3381201485070336.0,
- "trial_name": null,
- "trial_params": null
-}
diff --git a/checkpoint-200/training_args.bin b/checkpoint-200/training_args.bin
deleted file mode 100644
index 29a1b90871dc30211978426049e89f31e2b38f56..0000000000000000000000000000000000000000
--- a/checkpoint-200/training_args.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2493c95326c359fb00f88976162bc7966690beaaca22964b91c1db649a04988f
-size 6011
diff --git a/checkpoint-1000/README.md b/checkpoint-2200/adapter_model/adapter_model/README.md
similarity index 100%
rename from checkpoint-1000/README.md
rename to checkpoint-2200/adapter_model/adapter_model/README.md
diff --git a/checkpoint-1000/adapter_config.json b/checkpoint-2200/adapter_model/adapter_model/adapter_config.json
similarity index 100%
rename from checkpoint-1000/adapter_config.json
rename to checkpoint-2200/adapter_model/adapter_model/adapter_config.json
diff --git a/checkpoint-2200/adapter_model/adapter_model/adapter_model.bin b/checkpoint-2200/adapter_model/adapter_model/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..17a5d9a4024f623f507a7c923ee385b59403ab9b
--- /dev/null
+++ b/checkpoint-2200/adapter_model/adapter_model/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:18f11fbc4708b106870eec7154c2b9bbcad7ba5b185b2bacd1b7a7c4926deed7
+size 871609293
diff --git a/checkpoint-1200/README.md b/checkpoint-2400/README.md
similarity index 100%
rename from checkpoint-1200/README.md
rename to checkpoint-2400/README.md
diff --git a/checkpoint-1000/adapter_model/adapter_model/adapter_config.json b/checkpoint-2400/adapter_config.json
similarity index 100%
rename from checkpoint-1000/adapter_model/adapter_model/adapter_config.json
rename to checkpoint-2400/adapter_config.json
diff --git a/checkpoint-2400/adapter_model.bin b/checkpoint-2400/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..aa38a64bfc3a8cb8c55c290beaf3783f62c8da4e
--- /dev/null
+++ b/checkpoint-2400/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:23824721820d37d6fe44fee9306d0e71a5826aebaf3eb2f970cab6872288b55a
+size 871609293
diff --git a/checkpoint-1000/added_tokens.json b/checkpoint-2400/added_tokens.json
similarity index 100%
rename from checkpoint-1000/added_tokens.json
rename to checkpoint-2400/added_tokens.json
diff --git a/checkpoint-2400/optimizer.pt b/checkpoint-2400/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b18659185211a65315480e095ce60e088bb764bf
--- /dev/null
+++ b/checkpoint-2400/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e124e64b77531d5f98edc3d5e51a56228b9c1bccf94fbc775628676a06fb976
+size 873873439
diff --git a/checkpoint-2400/rng_state.pth b/checkpoint-2400/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..2d0f2264ea7662abcadfb8caac1c1afa09fe0b4e
--- /dev/null
+++ b/checkpoint-2400/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb0e855be1f0b0e57ff0dd7ae4a8185049253a39a36749acd9a9b1af0d3ab306
+size 14511
diff --git a/checkpoint-2400/scheduler.pt b/checkpoint-2400/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8346177cbadcccf082c60229d4e2d0d00c246e9d
--- /dev/null
+++ b/checkpoint-2400/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1cf2acca1029437820e5d9cd9f7ccc6efd7468e812d0f38078e4079d268163c1
+size 627
diff --git a/checkpoint-1000/special_tokens_map.json b/checkpoint-2400/special_tokens_map.json
similarity index 100%
rename from checkpoint-1000/special_tokens_map.json
rename to checkpoint-2400/special_tokens_map.json
diff --git a/checkpoint-1000/tokenizer.model b/checkpoint-2400/tokenizer.model
similarity index 100%
rename from checkpoint-1000/tokenizer.model
rename to checkpoint-2400/tokenizer.model
diff --git a/checkpoint-1000/tokenizer_config.json b/checkpoint-2400/tokenizer_config.json
similarity index 100%
rename from checkpoint-1000/tokenizer_config.json
rename to checkpoint-2400/tokenizer_config.json
diff --git a/checkpoint-1800/trainer_state.json b/checkpoint-2400/trainer_state.json
similarity index 74%
rename from checkpoint-1800/trainer_state.json
rename to checkpoint-2400/trainer_state.json
index 48264b9e81dcadada3a64de403682a3652b8b503..b3ad1e92c3e098aaf21f9adeeb09ccd7c376158a 100644
--- a/checkpoint-1800/trainer_state.json
+++ b/checkpoint-2400/trainer_state.json
@@ -1,8 +1,8 @@
{
- "best_metric": 6.617897987365723,
- "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-1600",
- "epoch": 0.013749904514551983,
- "global_step": 1800,
+ "best_metric": 6.580160140991211,
+ "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-2200",
+ "epoch": 0.018333206019402644,
+ "global_step": 2400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
@@ -10968,11 +10968,3665 @@
"mmlu_eval_accuracy_business_ethics": 0.3333333333333333,
"mmlu_loss": 4.238778591156006,
"step": 1800
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.1537,
+ "step": 1801
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.684,
+ "step": 1802
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.7862,
+ "step": 1803
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 9.3518,
+ "step": 1804
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 9.1795,
+ "step": 1805
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 4.0054,
+ "step": 1806
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.8997,
+ "step": 1807
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.9002,
+ "step": 1808
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.2805,
+ "step": 1809
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.1203,
+ "step": 1810
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.0206,
+ "step": 1811
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.0151,
+ "step": 1812
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.3864,
+ "step": 1813
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.1117,
+ "step": 1814
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.8487,
+ "step": 1815
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 5.59,
+ "step": 1816
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.1615,
+ "step": 1817
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.7362,
+ "step": 1818
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.2294,
+ "step": 1819
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.5622,
+ "step": 1820
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.5437,
+ "step": 1821
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.093,
+ "step": 1822
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.0343,
+ "step": 1823
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.4454,
+ "step": 1824
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 2.5138,
+ "step": 1825
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.5605,
+ "step": 1826
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.322,
+ "step": 1827
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.6489,
+ "step": 1828
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.331,
+ "step": 1829
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 5.6462,
+ "step": 1830
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.932,
+ "step": 1831
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 2.9058,
+ "step": 1832
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 5.3433,
+ "step": 1833
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 2.4365,
+ "step": 1834
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.3282,
+ "step": 1835
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 5.448,
+ "step": 1836
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 9.5369,
+ "step": 1837
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.177,
+ "step": 1838
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 9.3552,
+ "step": 1839
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.4568,
+ "step": 1840
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.0602,
+ "step": 1841
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.7449,
+ "step": 1842
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 4.2675,
+ "step": 1843
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 9.0317,
+ "step": 1844
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 2.4342,
+ "step": 1845
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.8688,
+ "step": 1846
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 2.3571,
+ "step": 1847
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.3776,
+ "step": 1848
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.2248,
+ "step": 1849
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 2.6073,
+ "step": 1850
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 5.8425,
+ "step": 1851
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.5954,
+ "step": 1852
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.4197,
+ "step": 1853
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.8624,
+ "step": 1854
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.9652,
+ "step": 1855
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 5.7145,
+ "step": 1856
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.5309,
+ "step": 1857
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.4356,
+ "step": 1858
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.6508,
+ "step": 1859
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.0955,
+ "step": 1860
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.6886,
+ "step": 1861
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.7644,
+ "step": 1862
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.5709,
+ "step": 1863
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.6212,
+ "step": 1864
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.6325,
+ "step": 1865
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.6805,
+ "step": 1866
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 5.1464,
+ "step": 1867
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 5.9244,
+ "step": 1868
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.336,
+ "step": 1869
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 4.8783,
+ "step": 1870
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.8236,
+ "step": 1871
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.084,
+ "step": 1872
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.9639,
+ "step": 1873
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.4173,
+ "step": 1874
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.0042,
+ "step": 1875
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.2519,
+ "step": 1876
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.4656,
+ "step": 1877
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.5136,
+ "step": 1878
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 4.3918,
+ "step": 1879
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.9696,
+ "step": 1880
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 5.9736,
+ "step": 1881
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 2.6192,
+ "step": 1882
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 2.3476,
+ "step": 1883
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.3048,
+ "step": 1884
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.1116,
+ "step": 1885
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.971,
+ "step": 1886
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.0741,
+ "step": 1887
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.1418,
+ "step": 1888
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.3487,
+ "step": 1889
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.38,
+ "step": 1890
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 5.6561,
+ "step": 1891
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 9.5606,
+ "step": 1892
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 2.8623,
+ "step": 1893
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 2.2984,
+ "step": 1894
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.6179,
+ "step": 1895
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.8625,
+ "step": 1896
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.8596,
+ "step": 1897
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.7205,
+ "step": 1898
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.6727,
+ "step": 1899
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.016,
+ "step": 1900
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.9868,
+ "step": 1901
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.632,
+ "step": 1902
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.5133,
+ "step": 1903
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 2.7476,
+ "step": 1904
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.4174,
+ "step": 1905
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.6789,
+ "step": 1906
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 9.4534,
+ "step": 1907
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.3335,
+ "step": 1908
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.7921,
+ "step": 1909
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.9567,
+ "step": 1910
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 2.1739,
+ "step": 1911
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.7514,
+ "step": 1912
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 2.3858,
+ "step": 1913
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.0462,
+ "step": 1914
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.3817,
+ "step": 1915
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 1.9739,
+ "step": 1916
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.1122,
+ "step": 1917
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.3361,
+ "step": 1918
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.3184,
+ "step": 1919
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 5.7342,
+ "step": 1920
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 4.375,
+ "step": 1921
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 5.6841,
+ "step": 1922
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.0773,
+ "step": 1923
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 9.8916,
+ "step": 1924
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 2.7176,
+ "step": 1925
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 5.8841,
+ "step": 1926
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.8345,
+ "step": 1927
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 4.561,
+ "step": 1928
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.5392,
+ "step": 1929
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.1627,
+ "step": 1930
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.0657,
+ "step": 1931
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.7385,
+ "step": 1932
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.5533,
+ "step": 1933
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.0925,
+ "step": 1934
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 5.8752,
+ "step": 1935
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 5.4039,
+ "step": 1936
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.6472,
+ "step": 1937
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 5.1819,
+ "step": 1938
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 5.5919,
+ "step": 1939
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 2.6527,
+ "step": 1940
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.5188,
+ "step": 1941
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.9856,
+ "step": 1942
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 4.7038,
+ "step": 1943
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 5.911,
+ "step": 1944
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 2.497,
+ "step": 1945
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.1804,
+ "step": 1946
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 2.3949,
+ "step": 1947
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 6.0433,
+ "step": 1948
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 5.4706,
+ "step": 1949
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.5896,
+ "step": 1950
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.557,
+ "step": 1951
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.34,
+ "step": 1952
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 9.7865,
+ "step": 1953
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 8.0797,
+ "step": 1954
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 9.2896,
+ "step": 1955
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 4.4096,
+ "step": 1956
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 3.9538,
+ "step": 1957
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 9.2778,
+ "step": 1958
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 7.4968,
+ "step": 1959
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 5.8328,
+ "step": 1960
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 9.4597,
+ "step": 1961
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 4.6776,
+ "step": 1962
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 0.0004,
+ "loss": 5.4861,
+ "step": 1963
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.5831,
+ "step": 1964
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.4585,
+ "step": 1965
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.7898,
+ "step": 1966
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.8714,
+ "step": 1967
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.752,
+ "step": 1968
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.9024,
+ "step": 1969
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.058,
+ "step": 1970
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.1745,
+ "step": 1971
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.2162,
+ "step": 1972
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.2668,
+ "step": 1973
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.3307,
+ "step": 1974
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.3285,
+ "step": 1975
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.1353,
+ "step": 1976
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.8069,
+ "step": 1977
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.6885,
+ "step": 1978
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 2.5946,
+ "step": 1979
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.6828,
+ "step": 1980
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.6516,
+ "step": 1981
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.261,
+ "step": 1982
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.524,
+ "step": 1983
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.585,
+ "step": 1984
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.8883,
+ "step": 1985
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.689,
+ "step": 1986
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.1083,
+ "step": 1987
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.1606,
+ "step": 1988
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.9243,
+ "step": 1989
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.6597,
+ "step": 1990
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.2849,
+ "step": 1991
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.3715,
+ "step": 1992
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.7262,
+ "step": 1993
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 2.6862,
+ "step": 1994
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.5412,
+ "step": 1995
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 2.7483,
+ "step": 1996
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.3391,
+ "step": 1997
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.2642,
+ "step": 1998
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.1519,
+ "step": 1999
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.7098,
+ "step": 2000
+ },
+ {
+ "epoch": 0.02,
+ "eval_loss": 6.762476921081543,
+ "eval_runtime": 22.4899,
+ "eval_samples_per_second": 2.223,
+ "eval_steps_per_second": 1.112,
+ "step": 2000
+ },
+ {
+ "epoch": 0.02,
+ "mmlu_eval_accuracy": 0.2525477994227994,
+ "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
+ "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
+ "mmlu_eval_accuracy_astronomy": 0.3125,
+ "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
+ "mmlu_loss": 3.4606559085845947,
+ "step": 2000
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.8099,
+ "step": 2001
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.0567,
+ "step": 2002
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 9.2981,
+ "step": 2003
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.2668,
+ "step": 2004
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.139,
+ "step": 2005
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 2.903,
+ "step": 2006
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 9.2182,
+ "step": 2007
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.2347,
+ "step": 2008
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.8383,
+ "step": 2009
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.0211,
+ "step": 2010
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.2572,
+ "step": 2011
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.2877,
+ "step": 2012
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.3577,
+ "step": 2013
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.2022,
+ "step": 2014
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.2722,
+ "step": 2015
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.0552,
+ "step": 2016
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.9857,
+ "step": 2017
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.0519,
+ "step": 2018
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.7118,
+ "step": 2019
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.4465,
+ "step": 2020
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.3009,
+ "step": 2021
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.3614,
+ "step": 2022
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.3493,
+ "step": 2023
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.34,
+ "step": 2024
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.0416,
+ "step": 2025
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 2.686,
+ "step": 2026
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.6021,
+ "step": 2027
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 2.4161,
+ "step": 2028
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 2.0029,
+ "step": 2029
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 1.8579,
+ "step": 2030
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.0247,
+ "step": 2031
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.4184,
+ "step": 2032
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.4962,
+ "step": 2033
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.5137,
+ "step": 2034
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 9.6692,
+ "step": 2035
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.7161,
+ "step": 2036
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 9.617,
+ "step": 2037
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.413,
+ "step": 2038
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.3056,
+ "step": 2039
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.9441,
+ "step": 2040
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.9943,
+ "step": 2041
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.5703,
+ "step": 2042
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.1881,
+ "step": 2043
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.5763,
+ "step": 2044
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.6389,
+ "step": 2045
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.1717,
+ "step": 2046
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.5482,
+ "step": 2047
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.9469,
+ "step": 2048
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.7685,
+ "step": 2049
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.1381,
+ "step": 2050
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.6961,
+ "step": 2051
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.6425,
+ "step": 2052
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.5354,
+ "step": 2053
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.2404,
+ "step": 2054
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.1556,
+ "step": 2055
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.7133,
+ "step": 2056
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.8166,
+ "step": 2057
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.5071,
+ "step": 2058
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.5429,
+ "step": 2059
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.0367,
+ "step": 2060
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.5386,
+ "step": 2061
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 9.5899,
+ "step": 2062
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.2968,
+ "step": 2063
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.9951,
+ "step": 2064
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.8608,
+ "step": 2065
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.4735,
+ "step": 2066
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.5612,
+ "step": 2067
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.7461,
+ "step": 2068
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.5887,
+ "step": 2069
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.3426,
+ "step": 2070
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.5589,
+ "step": 2071
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 2.498,
+ "step": 2072
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.1306,
+ "step": 2073
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 2.3492,
+ "step": 2074
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.2311,
+ "step": 2075
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.8798,
+ "step": 2076
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.6799,
+ "step": 2077
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.5011,
+ "step": 2078
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.8892,
+ "step": 2079
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 2.6449,
+ "step": 2080
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.9117,
+ "step": 2081
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.1157,
+ "step": 2082
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.196,
+ "step": 2083
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 9.9364,
+ "step": 2084
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.3618,
+ "step": 2085
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.3755,
+ "step": 2086
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.4564,
+ "step": 2087
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.4912,
+ "step": 2088
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.113,
+ "step": 2089
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.0588,
+ "step": 2090
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.668,
+ "step": 2091
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.08,
+ "step": 2092
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.2042,
+ "step": 2093
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.4134,
+ "step": 2094
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.0456,
+ "step": 2095
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.2245,
+ "step": 2096
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.4936,
+ "step": 2097
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.5158,
+ "step": 2098
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.7269,
+ "step": 2099
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.7077,
+ "step": 2100
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 2.6008,
+ "step": 2101
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.4652,
+ "step": 2102
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 1.918,
+ "step": 2103
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 9.5819,
+ "step": 2104
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.7764,
+ "step": 2105
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 9.0525,
+ "step": 2106
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.5359,
+ "step": 2107
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.4925,
+ "step": 2108
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.4857,
+ "step": 2109
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 1.9445,
+ "step": 2110
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.8494,
+ "step": 2111
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.1513,
+ "step": 2112
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.2552,
+ "step": 2113
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.7229,
+ "step": 2114
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.8571,
+ "step": 2115
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.5968,
+ "step": 2116
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.8806,
+ "step": 2117
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.4641,
+ "step": 2118
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.6039,
+ "step": 2119
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.1379,
+ "step": 2120
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.6688,
+ "step": 2121
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.293,
+ "step": 2122
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.5664,
+ "step": 2123
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.0825,
+ "step": 2124
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.9788,
+ "step": 2125
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.9641,
+ "step": 2126
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.7799,
+ "step": 2127
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.0619,
+ "step": 2128
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.0022,
+ "step": 2129
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.8022,
+ "step": 2130
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.5301,
+ "step": 2131
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.681,
+ "step": 2132
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.7362,
+ "step": 2133
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.5462,
+ "step": 2134
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.2356,
+ "step": 2135
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.2259,
+ "step": 2136
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.3646,
+ "step": 2137
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.8765,
+ "step": 2138
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.6487,
+ "step": 2139
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.9622,
+ "step": 2140
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.1761,
+ "step": 2141
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.6922,
+ "step": 2142
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.0371,
+ "step": 2143
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.7869,
+ "step": 2144
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.3725,
+ "step": 2145
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 2.8894,
+ "step": 2146
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.6083,
+ "step": 2147
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 2.4451,
+ "step": 2148
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.1149,
+ "step": 2149
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.8058,
+ "step": 2150
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.1308,
+ "step": 2151
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 9.1447,
+ "step": 2152
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.208,
+ "step": 2153
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.5193,
+ "step": 2154
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.7729,
+ "step": 2155
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.5019,
+ "step": 2156
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.6092,
+ "step": 2157
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.1853,
+ "step": 2158
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.7,
+ "step": 2159
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.1638,
+ "step": 2160
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.762,
+ "step": 2161
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.7455,
+ "step": 2162
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.9372,
+ "step": 2163
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.4569,
+ "step": 2164
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.6705,
+ "step": 2165
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.1988,
+ "step": 2166
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.2526,
+ "step": 2167
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.9066,
+ "step": 2168
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.1365,
+ "step": 2169
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.3422,
+ "step": 2170
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.2691,
+ "step": 2171
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.9008,
+ "step": 2172
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 9.2555,
+ "step": 2173
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.0886,
+ "step": 2174
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.0369,
+ "step": 2175
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.5566,
+ "step": 2176
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.2567,
+ "step": 2177
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.0179,
+ "step": 2178
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.5383,
+ "step": 2179
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.4797,
+ "step": 2180
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.0163,
+ "step": 2181
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.2658,
+ "step": 2182
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.1337,
+ "step": 2183
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.3287,
+ "step": 2184
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.7874,
+ "step": 2185
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.7153,
+ "step": 2186
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.7037,
+ "step": 2187
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.4412,
+ "step": 2188
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.3912,
+ "step": 2189
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.034,
+ "step": 2190
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.4697,
+ "step": 2191
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.6243,
+ "step": 2192
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.1133,
+ "step": 2193
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 1.9005,
+ "step": 2194
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.7386,
+ "step": 2195
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.4169,
+ "step": 2196
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.8139,
+ "step": 2197
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.3012,
+ "step": 2198
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.8223,
+ "step": 2199
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.3757,
+ "step": 2200
+ },
+ {
+ "epoch": 0.02,
+ "eval_loss": 6.580160140991211,
+ "eval_runtime": 22.4971,
+ "eval_samples_per_second": 2.223,
+ "eval_steps_per_second": 1.111,
+ "step": 2200
+ },
+ {
+ "epoch": 0.02,
+ "mmlu_eval_accuracy": 0.2525477994227994,
+ "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
+ "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
+ "mmlu_eval_accuracy_astronomy": 0.3125,
+ "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
+ "mmlu_loss": 3.755114164352417,
+ "step": 2200
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.5282,
+ "step": 2201
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.2478,
+ "step": 2202
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.916,
+ "step": 2203
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.5069,
+ "step": 2204
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.5952,
+ "step": 2205
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.5059,
+ "step": 2206
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 2.7434,
+ "step": 2207
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.625,
+ "step": 2208
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.1674,
+ "step": 2209
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.3937,
+ "step": 2210
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.8783,
+ "step": 2211
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.5263,
+ "step": 2212
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.7887,
+ "step": 2213
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.8911,
+ "step": 2214
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.7211,
+ "step": 2215
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.089,
+ "step": 2216
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.6373,
+ "step": 2217
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.7728,
+ "step": 2218
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.6957,
+ "step": 2219
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.43,
+ "step": 2220
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.9673,
+ "step": 2221
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.8942,
+ "step": 2222
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.2893,
+ "step": 2223
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.1505,
+ "step": 2224
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.3702,
+ "step": 2225
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.1731,
+ "step": 2226
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.997,
+ "step": 2227
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.9531,
+ "step": 2228
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.0748,
+ "step": 2229
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.0642,
+ "step": 2230
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.9469,
+ "step": 2231
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.2265,
+ "step": 2232
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 2.6461,
+ "step": 2233
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.064,
+ "step": 2234
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.1414,
+ "step": 2235
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.5375,
+ "step": 2236
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.6348,
+ "step": 2237
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.9975,
+ "step": 2238
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 2.5242,
+ "step": 2239
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.3179,
+ "step": 2240
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.6054,
+ "step": 2241
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.1832,
+ "step": 2242
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 2.0572,
+ "step": 2243
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.2049,
+ "step": 2244
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.6348,
+ "step": 2245
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.67,
+ "step": 2246
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 1.5627,
+ "step": 2247
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.1851,
+ "step": 2248
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.6792,
+ "step": 2249
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.6344,
+ "step": 2250
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.7603,
+ "step": 2251
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.7737,
+ "step": 2252
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.5323,
+ "step": 2253
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.4059,
+ "step": 2254
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.9343,
+ "step": 2255
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.0156,
+ "step": 2256
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.1851,
+ "step": 2257
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.44,
+ "step": 2258
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.9079,
+ "step": 2259
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.4982,
+ "step": 2260
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 12.3777,
+ "step": 2261
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 10.1265,
+ "step": 2262
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.1428,
+ "step": 2263
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.8072,
+ "step": 2264
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.911,
+ "step": 2265
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.9453,
+ "step": 2266
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.0168,
+ "step": 2267
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.2098,
+ "step": 2268
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.4417,
+ "step": 2269
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.8449,
+ "step": 2270
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.394,
+ "step": 2271
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.7642,
+ "step": 2272
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.5555,
+ "step": 2273
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.3576,
+ "step": 2274
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.386,
+ "step": 2275
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.6677,
+ "step": 2276
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.2385,
+ "step": 2277
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.8074,
+ "step": 2278
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.2963,
+ "step": 2279
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.3612,
+ "step": 2280
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.1837,
+ "step": 2281
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.5882,
+ "step": 2282
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.0968,
+ "step": 2283
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.2376,
+ "step": 2284
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.3835,
+ "step": 2285
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.0143,
+ "step": 2286
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.36,
+ "step": 2287
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.0121,
+ "step": 2288
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.0144,
+ "step": 2289
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.6807,
+ "step": 2290
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 2.8854,
+ "step": 2291
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.1727,
+ "step": 2292
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.533,
+ "step": 2293
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.9793,
+ "step": 2294
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.538,
+ "step": 2295
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.145,
+ "step": 2296
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.163,
+ "step": 2297
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.1045,
+ "step": 2298
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.0209,
+ "step": 2299
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.9728,
+ "step": 2300
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.8902,
+ "step": 2301
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.3075,
+ "step": 2302
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 10.2194,
+ "step": 2303
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.7375,
+ "step": 2304
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.3863,
+ "step": 2305
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.1317,
+ "step": 2306
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.1878,
+ "step": 2307
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.6124,
+ "step": 2308
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.8843,
+ "step": 2309
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.3988,
+ "step": 2310
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.3523,
+ "step": 2311
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.5766,
+ "step": 2312
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.9096,
+ "step": 2313
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.9315,
+ "step": 2314
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.4044,
+ "step": 2315
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.6206,
+ "step": 2316
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.2429,
+ "step": 2317
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.0383,
+ "step": 2318
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.4282,
+ "step": 2319
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.8973,
+ "step": 2320
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.1771,
+ "step": 2321
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.624,
+ "step": 2322
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.5197,
+ "step": 2323
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.7313,
+ "step": 2324
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.8234,
+ "step": 2325
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.1702,
+ "step": 2326
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.536,
+ "step": 2327
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.1904,
+ "step": 2328
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.2077,
+ "step": 2329
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.891,
+ "step": 2330
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.6784,
+ "step": 2331
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.6611,
+ "step": 2332
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.3402,
+ "step": 2333
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 11.1523,
+ "step": 2334
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 2.5547,
+ "step": 2335
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.3485,
+ "step": 2336
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.8289,
+ "step": 2337
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.2558,
+ "step": 2338
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.1794,
+ "step": 2339
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.8782,
+ "step": 2340
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.415,
+ "step": 2341
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.5257,
+ "step": 2342
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.4751,
+ "step": 2343
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.2259,
+ "step": 2344
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.8681,
+ "step": 2345
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.6307,
+ "step": 2346
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 9.1487,
+ "step": 2347
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 2.3949,
+ "step": 2348
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.6988,
+ "step": 2349
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.1299,
+ "step": 2350
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.9938,
+ "step": 2351
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.4176,
+ "step": 2352
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.0184,
+ "step": 2353
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.2779,
+ "step": 2354
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.0162,
+ "step": 2355
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.2335,
+ "step": 2356
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 2.5505,
+ "step": 2357
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.6445,
+ "step": 2358
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.6513,
+ "step": 2359
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.8503,
+ "step": 2360
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.1817,
+ "step": 2361
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.4376,
+ "step": 2362
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.1351,
+ "step": 2363
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.7566,
+ "step": 2364
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.626,
+ "step": 2365
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.5818,
+ "step": 2366
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.3033,
+ "step": 2367
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.9289,
+ "step": 2368
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.0301,
+ "step": 2369
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.4713,
+ "step": 2370
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.0931,
+ "step": 2371
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.5812,
+ "step": 2372
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 9.2272,
+ "step": 2373
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.5174,
+ "step": 2374
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.1849,
+ "step": 2375
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.7496,
+ "step": 2376
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.776,
+ "step": 2377
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.3555,
+ "step": 2378
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.688,
+ "step": 2379
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.0143,
+ "step": 2380
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.7788,
+ "step": 2381
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.7772,
+ "step": 2382
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.6875,
+ "step": 2383
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.9944,
+ "step": 2384
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.8363,
+ "step": 2385
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.7276,
+ "step": 2386
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.4892,
+ "step": 2387
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.1083,
+ "step": 2388
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.834,
+ "step": 2389
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.8406,
+ "step": 2390
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.1168,
+ "step": 2391
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 8.2535,
+ "step": 2392
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.9025,
+ "step": 2393
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.4481,
+ "step": 2394
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 5.7631,
+ "step": 2395
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 7.2051,
+ "step": 2396
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.7816,
+ "step": 2397
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 6.2566,
+ "step": 2398
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 4.1125,
+ "step": 2399
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 0.0004,
+ "loss": 3.5952,
+ "step": 2400
+ },
+ {
+ "epoch": 0.02,
+ "eval_loss": 6.616010665893555,
+ "eval_runtime": 22.4801,
+ "eval_samples_per_second": 2.224,
+ "eval_steps_per_second": 1.112,
+ "step": 2400
+ },
+ {
+ "epoch": 0.02,
+ "mmlu_eval_accuracy": 0.2525477994227994,
+ "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
+ "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
+ "mmlu_eval_accuracy_astronomy": 0.3125,
+ "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
+ "mmlu_loss": 3.427501640319824,
+ "step": 2400
}
],
"max_steps": 30000,
"num_train_epochs": 1,
- "total_flos": 3.036767351173939e+16,
+ "total_flos": 4.010429591529062e+16,
"trial_name": null,
"trial_params": null
}
diff --git a/checkpoint-1000/training_args.bin b/checkpoint-2400/training_args.bin
similarity index 100%
rename from checkpoint-1000/training_args.bin
rename to checkpoint-2400/training_args.bin
diff --git a/checkpoint-400/README.md b/checkpoint-400/README.md
deleted file mode 100644
index 82793f73e61dbb024e11fc6697bba1622d4d0db6..0000000000000000000000000000000000000000
--- a/checkpoint-400/README.md
+++ /dev/null
@@ -1,20 +0,0 @@
----
-library_name: peft
----
-## Training procedure
-
-
-The following `bitsandbytes` quantization config was used during training:
-- load_in_8bit: False
-- load_in_4bit: True
-- llm_int8_threshold: 6.0
-- llm_int8_skip_modules: None
-- llm_int8_enable_fp32_cpu_offload: False
-- llm_int8_has_fp16_weight: False
-- bnb_4bit_quant_type: nf4
-- bnb_4bit_use_double_quant: True
-- bnb_4bit_compute_dtype: bfloat16
-### Framework versions
-
-
-- PEFT 0.4.0
diff --git a/checkpoint-400/adapter_config.json b/checkpoint-400/adapter_config.json
deleted file mode 100644
index a2f0ea437da66b2120cc72d92fb46f999dfb8535..0000000000000000000000000000000000000000
--- a/checkpoint-400/adapter_config.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
- "auto_mapping": null,
- "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf",
- "bias": "none",
- "fan_in_fan_out": false,
- "inference_mode": true,
- "init_lora_weights": true,
- "layers_pattern": null,
- "layers_to_transform": null,
- "lora_alpha": 16.0,
- "lora_dropout": 0.1,
- "modules_to_save": null,
- "peft_type": "LORA",
- "r": 64,
- "revision": null,
- "target_modules": [
- "down_proj",
- "up_proj",
- "q_proj",
- "gate_proj",
- "o_proj",
- "v_proj",
- "k_proj"
- ],
- "task_type": "CAUSAL_LM"
-}
\ No newline at end of file
diff --git a/checkpoint-400/adapter_model.bin b/checkpoint-400/adapter_model.bin
deleted file mode 100644
index 1af18e67704ece5d5afe319037ebf45d777cb2e8..0000000000000000000000000000000000000000
--- a/checkpoint-400/adapter_model.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c3b765278d368b2dedcebc9e016a47aac2fe066d1fc3ca9e35226f4ad62815e0
-size 871609293
diff --git a/checkpoint-400/adapter_model/adapter_model/README.md b/checkpoint-400/adapter_model/adapter_model/README.md
deleted file mode 100644
index 82793f73e61dbb024e11fc6697bba1622d4d0db6..0000000000000000000000000000000000000000
--- a/checkpoint-400/adapter_model/adapter_model/README.md
+++ /dev/null
@@ -1,20 +0,0 @@
----
-library_name: peft
----
-## Training procedure
-
-
-The following `bitsandbytes` quantization config was used during training:
-- load_in_8bit: False
-- load_in_4bit: True
-- llm_int8_threshold: 6.0
-- llm_int8_skip_modules: None
-- llm_int8_enable_fp32_cpu_offload: False
-- llm_int8_has_fp16_weight: False
-- bnb_4bit_quant_type: nf4
-- bnb_4bit_use_double_quant: True
-- bnb_4bit_compute_dtype: bfloat16
-### Framework versions
-
-
-- PEFT 0.4.0
diff --git a/checkpoint-400/adapter_model/adapter_model/adapter_config.json b/checkpoint-400/adapter_model/adapter_model/adapter_config.json
deleted file mode 100644
index a2f0ea437da66b2120cc72d92fb46f999dfb8535..0000000000000000000000000000000000000000
--- a/checkpoint-400/adapter_model/adapter_model/adapter_config.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
- "auto_mapping": null,
- "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf",
- "bias": "none",
- "fan_in_fan_out": false,
- "inference_mode": true,
- "init_lora_weights": true,
- "layers_pattern": null,
- "layers_to_transform": null,
- "lora_alpha": 16.0,
- "lora_dropout": 0.1,
- "modules_to_save": null,
- "peft_type": "LORA",
- "r": 64,
- "revision": null,
- "target_modules": [
- "down_proj",
- "up_proj",
- "q_proj",
- "gate_proj",
- "o_proj",
- "v_proj",
- "k_proj"
- ],
- "task_type": "CAUSAL_LM"
-}
\ No newline at end of file
diff --git a/checkpoint-400/adapter_model/adapter_model/adapter_model.bin b/checkpoint-400/adapter_model/adapter_model/adapter_model.bin
deleted file mode 100644
index 1af18e67704ece5d5afe319037ebf45d777cb2e8..0000000000000000000000000000000000000000
--- a/checkpoint-400/adapter_model/adapter_model/adapter_model.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c3b765278d368b2dedcebc9e016a47aac2fe066d1fc3ca9e35226f4ad62815e0
-size 871609293
diff --git a/checkpoint-400/added_tokens.json b/checkpoint-400/added_tokens.json
deleted file mode 100644
index e41416ddd79948246ea2dced6800ea3cd531c424..0000000000000000000000000000000000000000
--- a/checkpoint-400/added_tokens.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
- "[PAD]": 32000
-}
diff --git a/checkpoint-400/optimizer.pt b/checkpoint-400/optimizer.pt
deleted file mode 100644
index ed4025b722c3d8217a8f1aae6c3f05aa623cdd62..0000000000000000000000000000000000000000
--- a/checkpoint-400/optimizer.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6cf97dac6fdd8b3b2f12b0c5ebbe6c3f059fddcb90eb13e1fbfabcd1f9fd090d
-size 873873439
diff --git a/checkpoint-400/rng_state.pth b/checkpoint-400/rng_state.pth
deleted file mode 100644
index bb3bf73a8e279d7032043e584357806f598b3636..0000000000000000000000000000000000000000
--- a/checkpoint-400/rng_state.pth
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8f63437b5c442ed826ada37c061fce84c2003c7a71e2c64fb861e62d8df0dd68
-size 14511
diff --git a/checkpoint-400/scheduler.pt b/checkpoint-400/scheduler.pt
deleted file mode 100644
index f658984f48aed2698e469bac0bfffb4ef21e2885..0000000000000000000000000000000000000000
--- a/checkpoint-400/scheduler.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9ef1ca3e6fc07b43239ed034e2d8e5ae6ded24ae869473b3f8f48afde040dedc
-size 627
diff --git a/checkpoint-400/special_tokens_map.json b/checkpoint-400/special_tokens_map.json
deleted file mode 100644
index 3f58a5e115855c6ea3cec98accae196ad927222e..0000000000000000000000000000000000000000
--- a/checkpoint-400/special_tokens_map.json
+++ /dev/null
@@ -1,6 +0,0 @@
-{
- "bos_token": "",
- "eos_token": "",
- "pad_token": "[PAD]",
- "unk_token": ""
-}
diff --git a/checkpoint-400/tokenizer.model b/checkpoint-400/tokenizer.model
deleted file mode 100644
index 6c00c742ce03c627d6cd5b795984876fa49fa899..0000000000000000000000000000000000000000
--- a/checkpoint-400/tokenizer.model
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
-size 499723
diff --git a/checkpoint-400/tokenizer_config.json b/checkpoint-400/tokenizer_config.json
deleted file mode 100644
index daaef2433dab9469de98b5b9a3848221ab25b7e8..0000000000000000000000000000000000000000
--- a/checkpoint-400/tokenizer_config.json
+++ /dev/null
@@ -1,35 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "bos_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- },
- "clean_up_tokenization_spaces": false,
- "eos_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- },
- "legacy": null,
- "model_max_length": 1000000000000000019884624838656,
- "pad_token": null,
- "padding_side": "right",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizer",
- "unk_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/checkpoint-400/trainer_state.json b/checkpoint-400/trainer_state.json
deleted file mode 100644
index fce191e45f7f0cb680ea9c161f06e90cd8d46b40..0000000000000000000000000000000000000000
--- a/checkpoint-400/trainer_state.json
+++ /dev/null
@@ -1,2452 +0,0 @@
-{
- "best_metric": 7.106412410736084,
- "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-400",
- "epoch": 0.003055534336567107,
- "global_step": 400,
- "is_hyper_param_search": false,
- "is_local_process_zero": true,
- "is_world_process_zero": true,
- "log_history": [
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0808,
- "step": 1
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8773,
- "step": 2
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1965,
- "step": 3
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.118,
- "step": 4
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1773,
- "step": 5
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1165,
- "step": 6
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2666,
- "step": 7
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3704,
- "step": 8
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9976,
- "step": 9
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.985,
- "step": 10
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.0541,
- "step": 11
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.6228,
- "step": 12
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.3651,
- "step": 13
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0867,
- "step": 14
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.4422,
- "step": 15
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.7759,
- "step": 16
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1446,
- "step": 17
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0007,
- "step": 18
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0894,
- "step": 19
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2424,
- "step": 20
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.1343,
- "step": 21
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5354,
- "step": 22
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1887,
- "step": 23
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.6652,
- "step": 24
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.964,
- "step": 25
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1872,
- "step": 26
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.4722,
- "step": 27
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1462,
- "step": 28
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0485,
- "step": 29
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.148,
- "step": 30
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7274,
- "step": 31
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.6689,
- "step": 32
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3384,
- "step": 33
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.5354,
- "step": 34
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.1976,
- "step": 35
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.8593,
- "step": 36
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.9302,
- "step": 37
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5968,
- "step": 38
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3169,
- "step": 39
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.1793,
- "step": 40
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.8457,
- "step": 41
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5177,
- "step": 42
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.003,
- "step": 43
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.9928,
- "step": 44
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.2574,
- "step": 45
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3915,
- "step": 46
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4105,
- "step": 47
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.1184,
- "step": 48
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.72,
- "step": 49
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9628,
- "step": 50
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2372,
- "step": 51
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3733,
- "step": 52
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8936,
- "step": 53
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5353,
- "step": 54
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.0754,
- "step": 55
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6685,
- "step": 56
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8984,
- "step": 57
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2265,
- "step": 58
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7696,
- "step": 59
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7349,
- "step": 60
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0221,
- "step": 61
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.1901,
- "step": 62
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.387,
- "step": 63
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7323,
- "step": 64
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2077,
- "step": 65
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.3155,
- "step": 66
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1656,
- "step": 67
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 13.0828,
- "step": 68
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5295,
- "step": 69
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4575,
- "step": 70
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.7654,
- "step": 71
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.6263,
- "step": 72
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 24.8238,
- "step": 73
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.0654,
- "step": 74
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 28.1046,
- "step": 75
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.3232,
- "step": 76
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 22.9712,
- "step": 77
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 18.8529,
- "step": 78
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.8356,
- "step": 79
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 16.472,
- "step": 80
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.2369,
- "step": 81
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.0731,
- "step": 82
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.8853,
- "step": 83
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5438,
- "step": 84
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2665,
- "step": 85
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5484,
- "step": 86
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7546,
- "step": 87
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4309,
- "step": 88
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5593,
- "step": 89
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3822,
- "step": 90
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.6315,
- "step": 91
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6116,
- "step": 92
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2288,
- "step": 93
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0053,
- "step": 94
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.359,
- "step": 95
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9235,
- "step": 96
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 31.9845,
- "step": 97
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.1385,
- "step": 98
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6161,
- "step": 99
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8096,
- "step": 100
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9918,
- "step": 101
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.344,
- "step": 102
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1607,
- "step": 103
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4834,
- "step": 104
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.704,
- "step": 105
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1238,
- "step": 106
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8066,
- "step": 107
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9656,
- "step": 108
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1979,
- "step": 109
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2294,
- "step": 110
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.066,
- "step": 111
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7914,
- "step": 112
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7344,
- "step": 113
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6703,
- "step": 114
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8817,
- "step": 115
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.7733,
- "step": 116
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.469,
- "step": 117
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1304,
- "step": 118
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.871,
- "step": 119
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5353,
- "step": 120
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9055,
- "step": 121
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6142,
- "step": 122
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0201,
- "step": 123
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3805,
- "step": 124
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6825,
- "step": 125
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7166,
- "step": 126
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.7747,
- "step": 127
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7695,
- "step": 128
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7291,
- "step": 129
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1296,
- "step": 130
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5374,
- "step": 131
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.1854,
- "step": 132
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.434,
- "step": 133
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.438,
- "step": 134
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3027,
- "step": 135
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.382,
- "step": 136
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9277,
- "step": 137
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.223,
- "step": 138
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3042,
- "step": 139
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6361,
- "step": 140
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3547,
- "step": 141
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.7181,
- "step": 142
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.7528,
- "step": 143
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.4316,
- "step": 144
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2219,
- "step": 145
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7788,
- "step": 146
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2749,
- "step": 147
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2397,
- "step": 148
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6243,
- "step": 149
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.145,
- "step": 150
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7951,
- "step": 151
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1862,
- "step": 152
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1305,
- "step": 153
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5766,
- "step": 154
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9232,
- "step": 155
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9936,
- "step": 156
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.9692,
- "step": 157
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2772,
- "step": 158
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.302,
- "step": 159
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9931,
- "step": 160
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9675,
- "step": 161
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8536,
- "step": 162
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6589,
- "step": 163
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.932,
- "step": 164
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0301,
- "step": 165
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4861,
- "step": 166
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1354,
- "step": 167
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0717,
- "step": 168
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9346,
- "step": 169
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9373,
- "step": 170
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8777,
- "step": 171
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4193,
- "step": 172
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6831,
- "step": 173
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4175,
- "step": 174
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3629,
- "step": 175
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.118,
- "step": 176
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.633,
- "step": 177
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8355,
- "step": 178
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4522,
- "step": 179
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9272,
- "step": 180
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4631,
- "step": 181
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2987,
- "step": 182
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1183,
- "step": 183
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.9976,
- "step": 184
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0668,
- "step": 185
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6291,
- "step": 186
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5937,
- "step": 187
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7382,
- "step": 188
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7677,
- "step": 189
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.0293,
- "step": 190
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.6407,
- "step": 191
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9508,
- "step": 192
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.5053,
- "step": 193
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.5718,
- "step": 194
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5211,
- "step": 195
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9557,
- "step": 196
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1609,
- "step": 197
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8505,
- "step": 198
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8278,
- "step": 199
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8447,
- "step": 200
- },
- {
- "epoch": 0.0,
- "eval_loss": 7.883856773376465,
- "eval_runtime": 22.4254,
- "eval_samples_per_second": 2.23,
- "eval_steps_per_second": 1.115,
- "step": 200
- },
- {
- "epoch": 0.0,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 4.629522514343262,
- "step": 200
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3249,
- "step": 201
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.352,
- "step": 202
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2984,
- "step": 203
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.2734,
- "step": 204
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1,
- "step": 205
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.448,
- "step": 206
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2387,
- "step": 207
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.861,
- "step": 208
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.603,
- "step": 209
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.29,
- "step": 210
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2105,
- "step": 211
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.1949,
- "step": 212
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0538,
- "step": 213
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0343,
- "step": 214
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7794,
- "step": 215
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5532,
- "step": 216
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2676,
- "step": 217
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.566,
- "step": 218
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0432,
- "step": 219
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9391,
- "step": 220
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.724,
- "step": 221
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.229,
- "step": 222
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3462,
- "step": 223
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0752,
- "step": 224
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.1966,
- "step": 225
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7279,
- "step": 226
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8484,
- "step": 227
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7291,
- "step": 228
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2665,
- "step": 229
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3551,
- "step": 230
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7338,
- "step": 231
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8407,
- "step": 232
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3581,
- "step": 233
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.441,
- "step": 234
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0788,
- "step": 235
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8404,
- "step": 236
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4314,
- "step": 237
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8426,
- "step": 238
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.0205,
- "step": 239
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4162,
- "step": 240
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7515,
- "step": 241
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1442,
- "step": 242
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5868,
- "step": 243
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6514,
- "step": 244
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2683,
- "step": 245
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.31,
- "step": 246
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0161,
- "step": 247
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.484,
- "step": 248
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9726,
- "step": 249
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0926,
- "step": 250
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5279,
- "step": 251
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0017,
- "step": 252
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5684,
- "step": 253
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3875,
- "step": 254
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9489,
- "step": 255
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.8948,
- "step": 256
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0856,
- "step": 257
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.599,
- "step": 258
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1575,
- "step": 259
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3701,
- "step": 260
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.464,
- "step": 261
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9193,
- "step": 262
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5679,
- "step": 263
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9424,
- "step": 264
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6689,
- "step": 265
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6475,
- "step": 266
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4311,
- "step": 267
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7426,
- "step": 268
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5191,
- "step": 269
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3059,
- "step": 270
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0142,
- "step": 271
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.4509,
- "step": 272
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0831,
- "step": 273
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6977,
- "step": 274
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4236,
- "step": 275
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2129,
- "step": 276
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1394,
- "step": 277
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.685,
- "step": 278
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0275,
- "step": 279
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.3215,
- "step": 280
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6542,
- "step": 281
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7614,
- "step": 282
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2996,
- "step": 283
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6275,
- "step": 284
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8736,
- "step": 285
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4667,
- "step": 286
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8486,
- "step": 287
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2125,
- "step": 288
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4523,
- "step": 289
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.551,
- "step": 290
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.7158,
- "step": 291
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5092,
- "step": 292
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9169,
- "step": 293
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5333,
- "step": 294
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9949,
- "step": 295
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.7189,
- "step": 296
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2366,
- "step": 297
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4745,
- "step": 298
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2439,
- "step": 299
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4176,
- "step": 300
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.9365,
- "step": 301
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5309,
- "step": 302
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2201,
- "step": 303
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0312,
- "step": 304
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4173,
- "step": 305
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4856,
- "step": 306
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5041,
- "step": 307
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3597,
- "step": 308
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8395,
- "step": 309
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0776,
- "step": 310
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7566,
- "step": 311
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9767,
- "step": 312
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3804,
- "step": 313
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5327,
- "step": 314
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5293,
- "step": 315
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4531,
- "step": 316
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3961,
- "step": 317
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5669,
- "step": 318
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8559,
- "step": 319
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.117,
- "step": 320
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4279,
- "step": 321
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7977,
- "step": 322
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.955,
- "step": 323
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0164,
- "step": 324
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.0495,
- "step": 325
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2768,
- "step": 326
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3162,
- "step": 327
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.88,
- "step": 328
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2157,
- "step": 329
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8427,
- "step": 330
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9729,
- "step": 331
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1779,
- "step": 332
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1302,
- "step": 333
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7705,
- "step": 334
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.523,
- "step": 335
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9375,
- "step": 336
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.1409,
- "step": 337
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.633,
- "step": 338
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6481,
- "step": 339
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.933,
- "step": 340
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9179,
- "step": 341
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9332,
- "step": 342
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6553,
- "step": 343
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7412,
- "step": 344
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.849,
- "step": 345
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.7321,
- "step": 346
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9717,
- "step": 347
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3465,
- "step": 348
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4535,
- "step": 349
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2376,
- "step": 350
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9025,
- "step": 351
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.916,
- "step": 352
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.3785,
- "step": 353
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0576,
- "step": 354
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5081,
- "step": 355
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1303,
- "step": 356
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3854,
- "step": 357
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5553,
- "step": 358
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9627,
- "step": 359
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.402,
- "step": 360
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3484,
- "step": 361
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5428,
- "step": 362
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9128,
- "step": 363
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3934,
- "step": 364
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4812,
- "step": 365
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5395,
- "step": 366
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6304,
- "step": 367
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5626,
- "step": 368
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5693,
- "step": 369
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3458,
- "step": 370
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6254,
- "step": 371
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8706,
- "step": 372
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6076,
- "step": 373
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.2912,
- "step": 374
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3326,
- "step": 375
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3735,
- "step": 376
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4916,
- "step": 377
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5553,
- "step": 378
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6241,
- "step": 379
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6106,
- "step": 380
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.266,
- "step": 381
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7738,
- "step": 382
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4988,
- "step": 383
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2968,
- "step": 384
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8512,
- "step": 385
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0341,
- "step": 386
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.898,
- "step": 387
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.23,
- "step": 388
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9608,
- "step": 389
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.3679,
- "step": 390
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.7074,
- "step": 391
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9903,
- "step": 392
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5845,
- "step": 393
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6493,
- "step": 394
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7962,
- "step": 395
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4865,
- "step": 396
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3418,
- "step": 397
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3942,
- "step": 398
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4715,
- "step": 399
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2073,
- "step": 400
- },
- {
- "epoch": 0.0,
- "eval_loss": 7.106412410736084,
- "eval_runtime": 22.5667,
- "eval_samples_per_second": 2.216,
- "eval_steps_per_second": 1.108,
- "step": 400
- },
- {
- "epoch": 0.0,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 2.9128687667846678,
- "step": 400
- }
- ],
- "max_steps": 30000,
- "num_train_epochs": 1,
- "total_flos": 6768426179887104.0,
- "trial_name": null,
- "trial_params": null
-}
diff --git a/checkpoint-400/training_args.bin b/checkpoint-400/training_args.bin
deleted file mode 100644
index 29a1b90871dc30211978426049e89f31e2b38f56..0000000000000000000000000000000000000000
--- a/checkpoint-400/training_args.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2493c95326c359fb00f88976162bc7966690beaaca22964b91c1db649a04988f
-size 6011
diff --git a/checkpoint-600/README.md b/checkpoint-600/README.md
deleted file mode 100644
index 82793f73e61dbb024e11fc6697bba1622d4d0db6..0000000000000000000000000000000000000000
--- a/checkpoint-600/README.md
+++ /dev/null
@@ -1,20 +0,0 @@
----
-library_name: peft
----
-## Training procedure
-
-
-The following `bitsandbytes` quantization config was used during training:
-- load_in_8bit: False
-- load_in_4bit: True
-- llm_int8_threshold: 6.0
-- llm_int8_skip_modules: None
-- llm_int8_enable_fp32_cpu_offload: False
-- llm_int8_has_fp16_weight: False
-- bnb_4bit_quant_type: nf4
-- bnb_4bit_use_double_quant: True
-- bnb_4bit_compute_dtype: bfloat16
-### Framework versions
-
-
-- PEFT 0.4.0
diff --git a/checkpoint-600/adapter_config.json b/checkpoint-600/adapter_config.json
deleted file mode 100644
index a2f0ea437da66b2120cc72d92fb46f999dfb8535..0000000000000000000000000000000000000000
--- a/checkpoint-600/adapter_config.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
- "auto_mapping": null,
- "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf",
- "bias": "none",
- "fan_in_fan_out": false,
- "inference_mode": true,
- "init_lora_weights": true,
- "layers_pattern": null,
- "layers_to_transform": null,
- "lora_alpha": 16.0,
- "lora_dropout": 0.1,
- "modules_to_save": null,
- "peft_type": "LORA",
- "r": 64,
- "revision": null,
- "target_modules": [
- "down_proj",
- "up_proj",
- "q_proj",
- "gate_proj",
- "o_proj",
- "v_proj",
- "k_proj"
- ],
- "task_type": "CAUSAL_LM"
-}
\ No newline at end of file
diff --git a/checkpoint-600/adapter_model.bin b/checkpoint-600/adapter_model.bin
deleted file mode 100644
index ddebccf356d115c41529e97fe0a1b2c87f2b9fef..0000000000000000000000000000000000000000
--- a/checkpoint-600/adapter_model.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3253f547a72570b385fa5e98301544475b384b591c052ac15628138597a464e5
-size 871609293
diff --git a/checkpoint-600/adapter_model/adapter_model/README.md b/checkpoint-600/adapter_model/adapter_model/README.md
deleted file mode 100644
index 38b20384364a8efcedd053adfbbe9d023582417d..0000000000000000000000000000000000000000
--- a/checkpoint-600/adapter_model/adapter_model/README.md
+++ /dev/null
@@ -1,32 +0,0 @@
----
-library_name: peft
----
-## Training procedure
-
-
-The following `bitsandbytes` quantization config was used during training:
-- load_in_8bit: False
-- load_in_4bit: True
-- llm_int8_threshold: 6.0
-- llm_int8_skip_modules: None
-- llm_int8_enable_fp32_cpu_offload: False
-- llm_int8_has_fp16_weight: False
-- bnb_4bit_quant_type: nf4
-- bnb_4bit_use_double_quant: True
-- bnb_4bit_compute_dtype: bfloat16
-
-The following `bitsandbytes` quantization config was used during training:
-- load_in_8bit: False
-- load_in_4bit: True
-- llm_int8_threshold: 6.0
-- llm_int8_skip_modules: None
-- llm_int8_enable_fp32_cpu_offload: False
-- llm_int8_has_fp16_weight: False
-- bnb_4bit_quant_type: nf4
-- bnb_4bit_use_double_quant: True
-- bnb_4bit_compute_dtype: bfloat16
-### Framework versions
-
-- PEFT 0.4.0
-
-- PEFT 0.4.0
diff --git a/checkpoint-600/adapter_model/adapter_model/adapter_config.json b/checkpoint-600/adapter_model/adapter_model/adapter_config.json
deleted file mode 100644
index a2f0ea437da66b2120cc72d92fb46f999dfb8535..0000000000000000000000000000000000000000
--- a/checkpoint-600/adapter_model/adapter_model/adapter_config.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
- "auto_mapping": null,
- "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf",
- "bias": "none",
- "fan_in_fan_out": false,
- "inference_mode": true,
- "init_lora_weights": true,
- "layers_pattern": null,
- "layers_to_transform": null,
- "lora_alpha": 16.0,
- "lora_dropout": 0.1,
- "modules_to_save": null,
- "peft_type": "LORA",
- "r": 64,
- "revision": null,
- "target_modules": [
- "down_proj",
- "up_proj",
- "q_proj",
- "gate_proj",
- "o_proj",
- "v_proj",
- "k_proj"
- ],
- "task_type": "CAUSAL_LM"
-}
\ No newline at end of file
diff --git a/checkpoint-600/adapter_model/adapter_model/adapter_model.bin b/checkpoint-600/adapter_model/adapter_model/adapter_model.bin
deleted file mode 100644
index 563fc6562d3416c3ea0ba611b05fea20cde70754..0000000000000000000000000000000000000000
--- a/checkpoint-600/adapter_model/adapter_model/adapter_model.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6e303e42c088047255552f44d314d0ca5796d75730d458b983db4f73e0981f91
-size 871609293
diff --git a/checkpoint-600/added_tokens.json b/checkpoint-600/added_tokens.json
deleted file mode 100644
index e41416ddd79948246ea2dced6800ea3cd531c424..0000000000000000000000000000000000000000
--- a/checkpoint-600/added_tokens.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
- "[PAD]": 32000
-}
diff --git a/checkpoint-600/optimizer.pt b/checkpoint-600/optimizer.pt
deleted file mode 100644
index 54a9e37e909625d3018a96314a728d17c98e7380..0000000000000000000000000000000000000000
--- a/checkpoint-600/optimizer.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bf860af281aaf44d8b3d2afcef4e54e1b292e8140f574d1ea0c3a05bfd8a67c4
-size 873873439
diff --git a/checkpoint-600/rng_state.pth b/checkpoint-600/rng_state.pth
deleted file mode 100644
index f917606bbdb4f2fcef212e0e8ba448db444da843..0000000000000000000000000000000000000000
--- a/checkpoint-600/rng_state.pth
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a3eff7102725a28942c87ca0535ed134753af6016520ff0479652c01444f5923
-size 14511
diff --git a/checkpoint-600/scheduler.pt b/checkpoint-600/scheduler.pt
deleted file mode 100644
index c88162200a06347eac69bcd2d6ca4adb6ef01c14..0000000000000000000000000000000000000000
--- a/checkpoint-600/scheduler.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:375ff21cc0ac3c3d2481c3e10491bf0755513bd8242939e41e1aee1a2d5b88f8
-size 627
diff --git a/checkpoint-600/special_tokens_map.json b/checkpoint-600/special_tokens_map.json
deleted file mode 100644
index 3f58a5e115855c6ea3cec98accae196ad927222e..0000000000000000000000000000000000000000
--- a/checkpoint-600/special_tokens_map.json
+++ /dev/null
@@ -1,6 +0,0 @@
-{
- "bos_token": "",
- "eos_token": "",
- "pad_token": "[PAD]",
- "unk_token": ""
-}
diff --git a/checkpoint-600/tokenizer.model b/checkpoint-600/tokenizer.model
deleted file mode 100644
index 6c00c742ce03c627d6cd5b795984876fa49fa899..0000000000000000000000000000000000000000
--- a/checkpoint-600/tokenizer.model
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
-size 499723
diff --git a/checkpoint-600/tokenizer_config.json b/checkpoint-600/tokenizer_config.json
deleted file mode 100644
index daaef2433dab9469de98b5b9a3848221ab25b7e8..0000000000000000000000000000000000000000
--- a/checkpoint-600/tokenizer_config.json
+++ /dev/null
@@ -1,35 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "bos_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- },
- "clean_up_tokenization_spaces": false,
- "eos_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- },
- "legacy": null,
- "model_max_length": 1000000000000000019884624838656,
- "pad_token": null,
- "padding_side": "right",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizer",
- "unk_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/checkpoint-600/trainer_state.json b/checkpoint-600/trainer_state.json
deleted file mode 100644
index 7ae3bc09c23b1f9d773ef4d6049cbce0e251b69f..0000000000000000000000000000000000000000
--- a/checkpoint-600/trainer_state.json
+++ /dev/null
@@ -1,3670 +0,0 @@
-{
- "best_metric": 7.012163162231445,
- "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-600",
- "epoch": 0.004583301504850661,
- "global_step": 600,
- "is_hyper_param_search": false,
- "is_local_process_zero": true,
- "is_world_process_zero": true,
- "log_history": [
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0808,
- "step": 1
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8773,
- "step": 2
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1965,
- "step": 3
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.118,
- "step": 4
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1773,
- "step": 5
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1165,
- "step": 6
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2666,
- "step": 7
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3704,
- "step": 8
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9976,
- "step": 9
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.985,
- "step": 10
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.0541,
- "step": 11
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.6228,
- "step": 12
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.3651,
- "step": 13
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0867,
- "step": 14
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.4422,
- "step": 15
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.7759,
- "step": 16
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1446,
- "step": 17
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0007,
- "step": 18
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0894,
- "step": 19
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2424,
- "step": 20
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.1343,
- "step": 21
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5354,
- "step": 22
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1887,
- "step": 23
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.6652,
- "step": 24
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.964,
- "step": 25
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1872,
- "step": 26
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.4722,
- "step": 27
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1462,
- "step": 28
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0485,
- "step": 29
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.148,
- "step": 30
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7274,
- "step": 31
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.6689,
- "step": 32
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3384,
- "step": 33
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.5354,
- "step": 34
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.1976,
- "step": 35
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.8593,
- "step": 36
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.9302,
- "step": 37
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5968,
- "step": 38
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3169,
- "step": 39
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.1793,
- "step": 40
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.8457,
- "step": 41
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5177,
- "step": 42
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.003,
- "step": 43
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.9928,
- "step": 44
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.2574,
- "step": 45
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3915,
- "step": 46
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4105,
- "step": 47
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.1184,
- "step": 48
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.72,
- "step": 49
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9628,
- "step": 50
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2372,
- "step": 51
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3733,
- "step": 52
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8936,
- "step": 53
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5353,
- "step": 54
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.0754,
- "step": 55
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6685,
- "step": 56
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8984,
- "step": 57
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2265,
- "step": 58
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7696,
- "step": 59
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7349,
- "step": 60
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0221,
- "step": 61
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.1901,
- "step": 62
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.387,
- "step": 63
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7323,
- "step": 64
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2077,
- "step": 65
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.3155,
- "step": 66
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1656,
- "step": 67
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 13.0828,
- "step": 68
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5295,
- "step": 69
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4575,
- "step": 70
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.7654,
- "step": 71
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.6263,
- "step": 72
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 24.8238,
- "step": 73
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.0654,
- "step": 74
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 28.1046,
- "step": 75
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.3232,
- "step": 76
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 22.9712,
- "step": 77
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 18.8529,
- "step": 78
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.8356,
- "step": 79
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 16.472,
- "step": 80
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.2369,
- "step": 81
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.0731,
- "step": 82
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.8853,
- "step": 83
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5438,
- "step": 84
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2665,
- "step": 85
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5484,
- "step": 86
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7546,
- "step": 87
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4309,
- "step": 88
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5593,
- "step": 89
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3822,
- "step": 90
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.6315,
- "step": 91
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6116,
- "step": 92
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2288,
- "step": 93
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0053,
- "step": 94
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.359,
- "step": 95
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9235,
- "step": 96
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 31.9845,
- "step": 97
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.1385,
- "step": 98
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6161,
- "step": 99
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8096,
- "step": 100
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9918,
- "step": 101
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.344,
- "step": 102
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1607,
- "step": 103
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4834,
- "step": 104
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.704,
- "step": 105
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1238,
- "step": 106
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8066,
- "step": 107
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9656,
- "step": 108
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1979,
- "step": 109
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2294,
- "step": 110
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.066,
- "step": 111
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7914,
- "step": 112
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7344,
- "step": 113
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6703,
- "step": 114
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8817,
- "step": 115
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.7733,
- "step": 116
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.469,
- "step": 117
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1304,
- "step": 118
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.871,
- "step": 119
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5353,
- "step": 120
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9055,
- "step": 121
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6142,
- "step": 122
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0201,
- "step": 123
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3805,
- "step": 124
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6825,
- "step": 125
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7166,
- "step": 126
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.7747,
- "step": 127
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7695,
- "step": 128
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7291,
- "step": 129
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1296,
- "step": 130
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5374,
- "step": 131
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.1854,
- "step": 132
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.434,
- "step": 133
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.438,
- "step": 134
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3027,
- "step": 135
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.382,
- "step": 136
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9277,
- "step": 137
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.223,
- "step": 138
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3042,
- "step": 139
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6361,
- "step": 140
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3547,
- "step": 141
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.7181,
- "step": 142
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.7528,
- "step": 143
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.4316,
- "step": 144
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2219,
- "step": 145
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7788,
- "step": 146
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2749,
- "step": 147
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2397,
- "step": 148
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6243,
- "step": 149
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.145,
- "step": 150
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7951,
- "step": 151
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1862,
- "step": 152
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1305,
- "step": 153
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5766,
- "step": 154
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9232,
- "step": 155
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9936,
- "step": 156
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.9692,
- "step": 157
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2772,
- "step": 158
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.302,
- "step": 159
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9931,
- "step": 160
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9675,
- "step": 161
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8536,
- "step": 162
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6589,
- "step": 163
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.932,
- "step": 164
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0301,
- "step": 165
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4861,
- "step": 166
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1354,
- "step": 167
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0717,
- "step": 168
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9346,
- "step": 169
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9373,
- "step": 170
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8777,
- "step": 171
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4193,
- "step": 172
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6831,
- "step": 173
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4175,
- "step": 174
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3629,
- "step": 175
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.118,
- "step": 176
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.633,
- "step": 177
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8355,
- "step": 178
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4522,
- "step": 179
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9272,
- "step": 180
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4631,
- "step": 181
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2987,
- "step": 182
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1183,
- "step": 183
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.9976,
- "step": 184
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0668,
- "step": 185
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6291,
- "step": 186
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5937,
- "step": 187
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7382,
- "step": 188
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7677,
- "step": 189
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.0293,
- "step": 190
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.6407,
- "step": 191
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9508,
- "step": 192
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.5053,
- "step": 193
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.5718,
- "step": 194
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5211,
- "step": 195
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9557,
- "step": 196
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1609,
- "step": 197
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8505,
- "step": 198
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8278,
- "step": 199
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8447,
- "step": 200
- },
- {
- "epoch": 0.0,
- "eval_loss": 7.883856773376465,
- "eval_runtime": 22.4254,
- "eval_samples_per_second": 2.23,
- "eval_steps_per_second": 1.115,
- "step": 200
- },
- {
- "epoch": 0.0,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 4.629522514343262,
- "step": 200
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3249,
- "step": 201
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.352,
- "step": 202
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2984,
- "step": 203
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.2734,
- "step": 204
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1,
- "step": 205
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.448,
- "step": 206
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2387,
- "step": 207
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.861,
- "step": 208
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.603,
- "step": 209
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.29,
- "step": 210
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2105,
- "step": 211
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.1949,
- "step": 212
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0538,
- "step": 213
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0343,
- "step": 214
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7794,
- "step": 215
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5532,
- "step": 216
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2676,
- "step": 217
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.566,
- "step": 218
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0432,
- "step": 219
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9391,
- "step": 220
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.724,
- "step": 221
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.229,
- "step": 222
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3462,
- "step": 223
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0752,
- "step": 224
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.1966,
- "step": 225
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7279,
- "step": 226
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8484,
- "step": 227
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7291,
- "step": 228
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2665,
- "step": 229
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3551,
- "step": 230
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7338,
- "step": 231
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8407,
- "step": 232
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3581,
- "step": 233
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.441,
- "step": 234
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0788,
- "step": 235
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8404,
- "step": 236
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4314,
- "step": 237
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8426,
- "step": 238
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.0205,
- "step": 239
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4162,
- "step": 240
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7515,
- "step": 241
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1442,
- "step": 242
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5868,
- "step": 243
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6514,
- "step": 244
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2683,
- "step": 245
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.31,
- "step": 246
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0161,
- "step": 247
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.484,
- "step": 248
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9726,
- "step": 249
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0926,
- "step": 250
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5279,
- "step": 251
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0017,
- "step": 252
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5684,
- "step": 253
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3875,
- "step": 254
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9489,
- "step": 255
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.8948,
- "step": 256
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0856,
- "step": 257
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.599,
- "step": 258
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1575,
- "step": 259
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3701,
- "step": 260
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.464,
- "step": 261
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9193,
- "step": 262
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5679,
- "step": 263
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9424,
- "step": 264
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6689,
- "step": 265
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6475,
- "step": 266
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4311,
- "step": 267
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7426,
- "step": 268
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5191,
- "step": 269
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3059,
- "step": 270
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0142,
- "step": 271
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.4509,
- "step": 272
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0831,
- "step": 273
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6977,
- "step": 274
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4236,
- "step": 275
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2129,
- "step": 276
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1394,
- "step": 277
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.685,
- "step": 278
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0275,
- "step": 279
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.3215,
- "step": 280
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6542,
- "step": 281
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7614,
- "step": 282
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2996,
- "step": 283
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6275,
- "step": 284
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8736,
- "step": 285
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4667,
- "step": 286
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8486,
- "step": 287
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2125,
- "step": 288
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4523,
- "step": 289
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.551,
- "step": 290
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.7158,
- "step": 291
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5092,
- "step": 292
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9169,
- "step": 293
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5333,
- "step": 294
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9949,
- "step": 295
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.7189,
- "step": 296
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2366,
- "step": 297
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4745,
- "step": 298
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2439,
- "step": 299
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4176,
- "step": 300
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.9365,
- "step": 301
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5309,
- "step": 302
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2201,
- "step": 303
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0312,
- "step": 304
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4173,
- "step": 305
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4856,
- "step": 306
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5041,
- "step": 307
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3597,
- "step": 308
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8395,
- "step": 309
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0776,
- "step": 310
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7566,
- "step": 311
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9767,
- "step": 312
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3804,
- "step": 313
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5327,
- "step": 314
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5293,
- "step": 315
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4531,
- "step": 316
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3961,
- "step": 317
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5669,
- "step": 318
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8559,
- "step": 319
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.117,
- "step": 320
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4279,
- "step": 321
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7977,
- "step": 322
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.955,
- "step": 323
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0164,
- "step": 324
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.0495,
- "step": 325
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2768,
- "step": 326
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3162,
- "step": 327
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.88,
- "step": 328
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2157,
- "step": 329
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8427,
- "step": 330
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9729,
- "step": 331
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1779,
- "step": 332
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1302,
- "step": 333
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7705,
- "step": 334
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.523,
- "step": 335
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9375,
- "step": 336
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.1409,
- "step": 337
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.633,
- "step": 338
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6481,
- "step": 339
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.933,
- "step": 340
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9179,
- "step": 341
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9332,
- "step": 342
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6553,
- "step": 343
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7412,
- "step": 344
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.849,
- "step": 345
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.7321,
- "step": 346
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9717,
- "step": 347
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3465,
- "step": 348
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4535,
- "step": 349
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2376,
- "step": 350
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9025,
- "step": 351
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.916,
- "step": 352
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.3785,
- "step": 353
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0576,
- "step": 354
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5081,
- "step": 355
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1303,
- "step": 356
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3854,
- "step": 357
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5553,
- "step": 358
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9627,
- "step": 359
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.402,
- "step": 360
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3484,
- "step": 361
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5428,
- "step": 362
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9128,
- "step": 363
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3934,
- "step": 364
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4812,
- "step": 365
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5395,
- "step": 366
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6304,
- "step": 367
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5626,
- "step": 368
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5693,
- "step": 369
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3458,
- "step": 370
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6254,
- "step": 371
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8706,
- "step": 372
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6076,
- "step": 373
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.2912,
- "step": 374
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3326,
- "step": 375
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3735,
- "step": 376
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4916,
- "step": 377
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5553,
- "step": 378
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6241,
- "step": 379
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6106,
- "step": 380
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.266,
- "step": 381
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7738,
- "step": 382
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4988,
- "step": 383
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2968,
- "step": 384
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8512,
- "step": 385
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0341,
- "step": 386
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.898,
- "step": 387
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.23,
- "step": 388
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9608,
- "step": 389
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.3679,
- "step": 390
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.7074,
- "step": 391
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9903,
- "step": 392
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5845,
- "step": 393
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6493,
- "step": 394
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7962,
- "step": 395
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4865,
- "step": 396
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3418,
- "step": 397
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3942,
- "step": 398
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4715,
- "step": 399
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2073,
- "step": 400
- },
- {
- "epoch": 0.0,
- "eval_loss": 7.106412410736084,
- "eval_runtime": 22.5667,
- "eval_samples_per_second": 2.216,
- "eval_steps_per_second": 1.108,
- "step": 400
- },
- {
- "epoch": 0.0,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 2.9128687667846678,
- "step": 400
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3984,
- "step": 401
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7983,
- "step": 402
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8589,
- "step": 403
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9884,
- "step": 404
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4427,
- "step": 405
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0374,
- "step": 406
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7999,
- "step": 407
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2437,
- "step": 408
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6902,
- "step": 409
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.81,
- "step": 410
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8979,
- "step": 411
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0211,
- "step": 412
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3945,
- "step": 413
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5807,
- "step": 414
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1433,
- "step": 415
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9466,
- "step": 416
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6276,
- "step": 417
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4945,
- "step": 418
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.6215,
- "step": 419
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.3919,
- "step": 420
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7915,
- "step": 421
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3284,
- "step": 422
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8723,
- "step": 423
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0149,
- "step": 424
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.979,
- "step": 425
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9175,
- "step": 426
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4994,
- "step": 427
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9791,
- "step": 428
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1156,
- "step": 429
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5813,
- "step": 430
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1882,
- "step": 431
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9956,
- "step": 432
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6189,
- "step": 433
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9624,
- "step": 434
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5387,
- "step": 435
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4605,
- "step": 436
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.474,
- "step": 437
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0497,
- "step": 438
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5705,
- "step": 439
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.275,
- "step": 440
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9638,
- "step": 441
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4857,
- "step": 442
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3067,
- "step": 443
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8152,
- "step": 444
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1668,
- "step": 445
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5293,
- "step": 446
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3981,
- "step": 447
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4787,
- "step": 448
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5981,
- "step": 449
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3569,
- "step": 450
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4088,
- "step": 451
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3677,
- "step": 452
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.4686,
- "step": 453
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3552,
- "step": 454
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7931,
- "step": 455
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9285,
- "step": 456
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0554,
- "step": 457
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7277,
- "step": 458
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2474,
- "step": 459
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9274,
- "step": 460
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2558,
- "step": 461
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.7547,
- "step": 462
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1264,
- "step": 463
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2124,
- "step": 464
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8751,
- "step": 465
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7317,
- "step": 466
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3697,
- "step": 467
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0021,
- "step": 468
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3761,
- "step": 469
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2291,
- "step": 470
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7968,
- "step": 471
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9454,
- "step": 472
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0194,
- "step": 473
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5048,
- "step": 474
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6837,
- "step": 475
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1066,
- "step": 476
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3501,
- "step": 477
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5071,
- "step": 478
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1086,
- "step": 479
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7269,
- "step": 480
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5419,
- "step": 481
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2974,
- "step": 482
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.1433,
- "step": 483
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0869,
- "step": 484
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.032,
- "step": 485
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0946,
- "step": 486
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7162,
- "step": 487
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0406,
- "step": 488
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.9048,
- "step": 489
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2231,
- "step": 490
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.6524,
- "step": 491
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1151,
- "step": 492
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.591,
- "step": 493
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1628,
- "step": 494
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0757,
- "step": 495
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3471,
- "step": 496
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9385,
- "step": 497
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9362,
- "step": 498
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2252,
- "step": 499
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.359,
- "step": 500
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0497,
- "step": 501
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0484,
- "step": 502
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5773,
- "step": 503
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.39,
- "step": 504
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5923,
- "step": 505
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2,
- "step": 506
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5536,
- "step": 507
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.8958,
- "step": 508
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7763,
- "step": 509
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2045,
- "step": 510
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.4219,
- "step": 511
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6305,
- "step": 512
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4243,
- "step": 513
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7842,
- "step": 514
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8769,
- "step": 515
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8903,
- "step": 516
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0489,
- "step": 517
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1314,
- "step": 518
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5973,
- "step": 519
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8022,
- "step": 520
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3539,
- "step": 521
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.222,
- "step": 522
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5403,
- "step": 523
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1323,
- "step": 524
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7813,
- "step": 525
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4982,
- "step": 526
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2426,
- "step": 527
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0142,
- "step": 528
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8996,
- "step": 529
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8671,
- "step": 530
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4139,
- "step": 531
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9478,
- "step": 532
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7062,
- "step": 533
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.0098,
- "step": 534
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9195,
- "step": 535
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0255,
- "step": 536
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6291,
- "step": 537
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3245,
- "step": 538
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6382,
- "step": 539
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.8076,
- "step": 540
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6725,
- "step": 541
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0563,
- "step": 542
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.6178,
- "step": 543
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7974,
- "step": 544
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.7535,
- "step": 545
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4948,
- "step": 546
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8941,
- "step": 547
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.6496,
- "step": 548
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.9084,
- "step": 549
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.65,
- "step": 550
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7673,
- "step": 551
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2221,
- "step": 552
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.14,
- "step": 553
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.6747,
- "step": 554
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8009,
- "step": 555
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7307,
- "step": 556
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0143,
- "step": 557
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8098,
- "step": 558
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.026,
- "step": 559
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4572,
- "step": 560
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7913,
- "step": 561
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9962,
- "step": 562
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.767,
- "step": 563
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9497,
- "step": 564
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9626,
- "step": 565
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2536,
- "step": 566
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0421,
- "step": 567
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8177,
- "step": 568
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9241,
- "step": 569
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0162,
- "step": 570
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3368,
- "step": 571
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7515,
- "step": 572
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6389,
- "step": 573
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.662,
- "step": 574
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8097,
- "step": 575
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9346,
- "step": 576
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3154,
- "step": 577
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7724,
- "step": 578
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3685,
- "step": 579
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.2775,
- "step": 580
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.106,
- "step": 581
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4733,
- "step": 582
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2334,
- "step": 583
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9478,
- "step": 584
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0013,
- "step": 585
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7242,
- "step": 586
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.922,
- "step": 587
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1418,
- "step": 588
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4472,
- "step": 589
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4785,
- "step": 590
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.783,
- "step": 591
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0706,
- "step": 592
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4136,
- "step": 593
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5969,
- "step": 594
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5157,
- "step": 595
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5658,
- "step": 596
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4647,
- "step": 597
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2028,
- "step": 598
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6913,
- "step": 599
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7239,
- "step": 600
- },
- {
- "epoch": 0.0,
- "eval_loss": 7.012163162231445,
- "eval_runtime": 22.5807,
- "eval_samples_per_second": 2.214,
- "eval_steps_per_second": 1.107,
- "step": 600
- },
- {
- "epoch": 0.0,
- "mmlu_eval_accuracy": 0.3260281385281385,
- "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
- "mmlu_eval_accuracy_anatomy": 0.35714285714285715,
- "mmlu_eval_accuracy_astronomy": 0.25,
- "mmlu_eval_accuracy_business_ethics": 0.3333333333333333,
- "mmlu_loss": 4.24488224029541,
- "step": 600
- }
- ],
- "max_steps": 30000,
- "num_train_epochs": 1,
- "total_flos": 9953457954422784.0,
- "trial_name": null,
- "trial_params": null
-}
diff --git a/checkpoint-600/training_args.bin b/checkpoint-600/training_args.bin
deleted file mode 100644
index 29a1b90871dc30211978426049e89f31e2b38f56..0000000000000000000000000000000000000000
--- a/checkpoint-600/training_args.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2493c95326c359fb00f88976162bc7966690beaaca22964b91c1db649a04988f
-size 6011
diff --git a/checkpoint-800/README.md b/checkpoint-800/README.md
deleted file mode 100644
index 82793f73e61dbb024e11fc6697bba1622d4d0db6..0000000000000000000000000000000000000000
--- a/checkpoint-800/README.md
+++ /dev/null
@@ -1,20 +0,0 @@
----
-library_name: peft
----
-## Training procedure
-
-
-The following `bitsandbytes` quantization config was used during training:
-- load_in_8bit: False
-- load_in_4bit: True
-- llm_int8_threshold: 6.0
-- llm_int8_skip_modules: None
-- llm_int8_enable_fp32_cpu_offload: False
-- llm_int8_has_fp16_weight: False
-- bnb_4bit_quant_type: nf4
-- bnb_4bit_use_double_quant: True
-- bnb_4bit_compute_dtype: bfloat16
-### Framework versions
-
-
-- PEFT 0.4.0
diff --git a/checkpoint-800/adapter_config.json b/checkpoint-800/adapter_config.json
deleted file mode 100644
index a2f0ea437da66b2120cc72d92fb46f999dfb8535..0000000000000000000000000000000000000000
--- a/checkpoint-800/adapter_config.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
- "auto_mapping": null,
- "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf",
- "bias": "none",
- "fan_in_fan_out": false,
- "inference_mode": true,
- "init_lora_weights": true,
- "layers_pattern": null,
- "layers_to_transform": null,
- "lora_alpha": 16.0,
- "lora_dropout": 0.1,
- "modules_to_save": null,
- "peft_type": "LORA",
- "r": 64,
- "revision": null,
- "target_modules": [
- "down_proj",
- "up_proj",
- "q_proj",
- "gate_proj",
- "o_proj",
- "v_proj",
- "k_proj"
- ],
- "task_type": "CAUSAL_LM"
-}
\ No newline at end of file
diff --git a/checkpoint-800/adapter_model.bin b/checkpoint-800/adapter_model.bin
deleted file mode 100644
index 563fc6562d3416c3ea0ba611b05fea20cde70754..0000000000000000000000000000000000000000
--- a/checkpoint-800/adapter_model.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6e303e42c088047255552f44d314d0ca5796d75730d458b983db4f73e0981f91
-size 871609293
diff --git a/checkpoint-800/added_tokens.json b/checkpoint-800/added_tokens.json
deleted file mode 100644
index e41416ddd79948246ea2dced6800ea3cd531c424..0000000000000000000000000000000000000000
--- a/checkpoint-800/added_tokens.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
- "[PAD]": 32000
-}
diff --git a/checkpoint-800/optimizer.pt b/checkpoint-800/optimizer.pt
deleted file mode 100644
index 54d0ecc4ecd7e155f1f12c395c722f01d28bdfe8..0000000000000000000000000000000000000000
--- a/checkpoint-800/optimizer.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7cd06031b6e8ae4f33d0fd9ab0279565bb16805f36b7f8c9c502e763e55387db
-size 873873439
diff --git a/checkpoint-800/rng_state.pth b/checkpoint-800/rng_state.pth
deleted file mode 100644
index 6739212c41f0943d45eeefce0617a9c1fc6ebc46..0000000000000000000000000000000000000000
--- a/checkpoint-800/rng_state.pth
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:40ea66c6576e607dd3b03262737e2cc373926f59159028dee1a22e9d4bb9137e
-size 14511
diff --git a/checkpoint-800/scheduler.pt b/checkpoint-800/scheduler.pt
deleted file mode 100644
index 3928d63d5c99298229914f0696159f4c207e9e91..0000000000000000000000000000000000000000
--- a/checkpoint-800/scheduler.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f65bc6eebca3baf3a9e642997ebf957326052a11f3e513f00b005285909a6c9b
-size 627
diff --git a/checkpoint-800/special_tokens_map.json b/checkpoint-800/special_tokens_map.json
deleted file mode 100644
index 3f58a5e115855c6ea3cec98accae196ad927222e..0000000000000000000000000000000000000000
--- a/checkpoint-800/special_tokens_map.json
+++ /dev/null
@@ -1,6 +0,0 @@
-{
- "bos_token": "",
- "eos_token": "",
- "pad_token": "[PAD]",
- "unk_token": ""
-}
diff --git a/checkpoint-800/tokenizer.model b/checkpoint-800/tokenizer.model
deleted file mode 100644
index 6c00c742ce03c627d6cd5b795984876fa49fa899..0000000000000000000000000000000000000000
--- a/checkpoint-800/tokenizer.model
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
-size 499723
diff --git a/checkpoint-800/tokenizer_config.json b/checkpoint-800/tokenizer_config.json
deleted file mode 100644
index daaef2433dab9469de98b5b9a3848221ab25b7e8..0000000000000000000000000000000000000000
--- a/checkpoint-800/tokenizer_config.json
+++ /dev/null
@@ -1,35 +0,0 @@
-{
- "add_bos_token": true,
- "add_eos_token": false,
- "bos_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- },
- "clean_up_tokenization_spaces": false,
- "eos_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- },
- "legacy": null,
- "model_max_length": 1000000000000000019884624838656,
- "pad_token": null,
- "padding_side": "right",
- "sp_model_kwargs": {},
- "tokenizer_class": "LlamaTokenizer",
- "unk_token": {
- "__type": "AddedToken",
- "content": "",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- }
-}
diff --git a/checkpoint-800/trainer_state.json b/checkpoint-800/trainer_state.json
deleted file mode 100644
index a050b4cadaca42f41af73048c2c0663c180694e9..0000000000000000000000000000000000000000
--- a/checkpoint-800/trainer_state.json
+++ /dev/null
@@ -1,4888 +0,0 @@
-{
- "best_metric": 7.012163162231445,
- "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-600",
- "epoch": 0.006111068673134214,
- "global_step": 800,
- "is_hyper_param_search": false,
- "is_local_process_zero": true,
- "is_world_process_zero": true,
- "log_history": [
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0808,
- "step": 1
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8773,
- "step": 2
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1965,
- "step": 3
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.118,
- "step": 4
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1773,
- "step": 5
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1165,
- "step": 6
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2666,
- "step": 7
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3704,
- "step": 8
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9976,
- "step": 9
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.985,
- "step": 10
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.0541,
- "step": 11
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.6228,
- "step": 12
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.3651,
- "step": 13
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0867,
- "step": 14
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.4422,
- "step": 15
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.7759,
- "step": 16
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1446,
- "step": 17
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0007,
- "step": 18
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.0894,
- "step": 19
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2424,
- "step": 20
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.1343,
- "step": 21
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5354,
- "step": 22
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1887,
- "step": 23
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.6652,
- "step": 24
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.964,
- "step": 25
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1872,
- "step": 26
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.4722,
- "step": 27
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.1462,
- "step": 28
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0485,
- "step": 29
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.148,
- "step": 30
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7274,
- "step": 31
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.6689,
- "step": 32
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3384,
- "step": 33
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.5354,
- "step": 34
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.1976,
- "step": 35
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.8593,
- "step": 36
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.9302,
- "step": 37
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5968,
- "step": 38
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3169,
- "step": 39
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.1793,
- "step": 40
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.8457,
- "step": 41
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.5177,
- "step": 42
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.003,
- "step": 43
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.9928,
- "step": 44
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.2574,
- "step": 45
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 0.3915,
- "step": 46
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4105,
- "step": 47
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.1184,
- "step": 48
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.72,
- "step": 49
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9628,
- "step": 50
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2372,
- "step": 51
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3733,
- "step": 52
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8936,
- "step": 53
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5353,
- "step": 54
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.0754,
- "step": 55
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6685,
- "step": 56
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8984,
- "step": 57
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2265,
- "step": 58
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7696,
- "step": 59
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7349,
- "step": 60
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0221,
- "step": 61
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.1901,
- "step": 62
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.387,
- "step": 63
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7323,
- "step": 64
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2077,
- "step": 65
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.3155,
- "step": 66
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1656,
- "step": 67
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 13.0828,
- "step": 68
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5295,
- "step": 69
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4575,
- "step": 70
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.7654,
- "step": 71
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.6263,
- "step": 72
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 24.8238,
- "step": 73
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.0654,
- "step": 74
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 28.1046,
- "step": 75
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.3232,
- "step": 76
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 22.9712,
- "step": 77
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 18.8529,
- "step": 78
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 15.8356,
- "step": 79
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 16.472,
- "step": 80
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.2369,
- "step": 81
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 14.0731,
- "step": 82
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.8853,
- "step": 83
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5438,
- "step": 84
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2665,
- "step": 85
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5484,
- "step": 86
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7546,
- "step": 87
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4309,
- "step": 88
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5593,
- "step": 89
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3822,
- "step": 90
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.6315,
- "step": 91
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6116,
- "step": 92
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2288,
- "step": 93
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0053,
- "step": 94
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.359,
- "step": 95
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9235,
- "step": 96
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 31.9845,
- "step": 97
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.1385,
- "step": 98
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6161,
- "step": 99
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8096,
- "step": 100
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9918,
- "step": 101
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.344,
- "step": 102
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1607,
- "step": 103
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4834,
- "step": 104
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.704,
- "step": 105
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1238,
- "step": 106
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8066,
- "step": 107
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9656,
- "step": 108
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1979,
- "step": 109
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2294,
- "step": 110
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.066,
- "step": 111
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7914,
- "step": 112
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7344,
- "step": 113
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6703,
- "step": 114
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8817,
- "step": 115
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.7733,
- "step": 116
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.469,
- "step": 117
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1304,
- "step": 118
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.871,
- "step": 119
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5353,
- "step": 120
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9055,
- "step": 121
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6142,
- "step": 122
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0201,
- "step": 123
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3805,
- "step": 124
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6825,
- "step": 125
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7166,
- "step": 126
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.7747,
- "step": 127
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7695,
- "step": 128
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7291,
- "step": 129
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1296,
- "step": 130
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5374,
- "step": 131
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.1854,
- "step": 132
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.434,
- "step": 133
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.438,
- "step": 134
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3027,
- "step": 135
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.382,
- "step": 136
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9277,
- "step": 137
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.223,
- "step": 138
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3042,
- "step": 139
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6361,
- "step": 140
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3547,
- "step": 141
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.7181,
- "step": 142
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.7528,
- "step": 143
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.4316,
- "step": 144
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2219,
- "step": 145
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7788,
- "step": 146
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2749,
- "step": 147
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2397,
- "step": 148
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6243,
- "step": 149
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.145,
- "step": 150
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7951,
- "step": 151
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1862,
- "step": 152
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.1305,
- "step": 153
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5766,
- "step": 154
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9232,
- "step": 155
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9936,
- "step": 156
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.9692,
- "step": 157
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2772,
- "step": 158
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.302,
- "step": 159
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9931,
- "step": 160
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9675,
- "step": 161
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.8536,
- "step": 162
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6589,
- "step": 163
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.932,
- "step": 164
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0301,
- "step": 165
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4861,
- "step": 166
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1354,
- "step": 167
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0717,
- "step": 168
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9346,
- "step": 169
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9373,
- "step": 170
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8777,
- "step": 171
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4193,
- "step": 172
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6831,
- "step": 173
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4175,
- "step": 174
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3629,
- "step": 175
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.118,
- "step": 176
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.633,
- "step": 177
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8355,
- "step": 178
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4522,
- "step": 179
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9272,
- "step": 180
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4631,
- "step": 181
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2987,
- "step": 182
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1183,
- "step": 183
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.9976,
- "step": 184
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0668,
- "step": 185
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6291,
- "step": 186
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5937,
- "step": 187
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7382,
- "step": 188
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7677,
- "step": 189
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.0293,
- "step": 190
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.6407,
- "step": 191
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9508,
- "step": 192
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.5053,
- "step": 193
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.5718,
- "step": 194
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5211,
- "step": 195
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9557,
- "step": 196
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1609,
- "step": 197
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8505,
- "step": 198
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8278,
- "step": 199
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8447,
- "step": 200
- },
- {
- "epoch": 0.0,
- "eval_loss": 7.883856773376465,
- "eval_runtime": 22.4254,
- "eval_samples_per_second": 2.23,
- "eval_steps_per_second": 1.115,
- "step": 200
- },
- {
- "epoch": 0.0,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 4.629522514343262,
- "step": 200
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3249,
- "step": 201
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.352,
- "step": 202
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2984,
- "step": 203
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.2734,
- "step": 204
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1,
- "step": 205
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.448,
- "step": 206
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2387,
- "step": 207
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.861,
- "step": 208
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.603,
- "step": 209
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.29,
- "step": 210
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2105,
- "step": 211
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.1949,
- "step": 212
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0538,
- "step": 213
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0343,
- "step": 214
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7794,
- "step": 215
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5532,
- "step": 216
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2676,
- "step": 217
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.566,
- "step": 218
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0432,
- "step": 219
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9391,
- "step": 220
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.724,
- "step": 221
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.229,
- "step": 222
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3462,
- "step": 223
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0752,
- "step": 224
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.1966,
- "step": 225
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7279,
- "step": 226
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8484,
- "step": 227
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7291,
- "step": 228
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2665,
- "step": 229
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3551,
- "step": 230
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7338,
- "step": 231
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8407,
- "step": 232
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3581,
- "step": 233
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.441,
- "step": 234
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0788,
- "step": 235
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8404,
- "step": 236
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4314,
- "step": 237
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8426,
- "step": 238
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.0205,
- "step": 239
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4162,
- "step": 240
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7515,
- "step": 241
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1442,
- "step": 242
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5868,
- "step": 243
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6514,
- "step": 244
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2683,
- "step": 245
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.31,
- "step": 246
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0161,
- "step": 247
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.484,
- "step": 248
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9726,
- "step": 249
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.0926,
- "step": 250
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5279,
- "step": 251
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0017,
- "step": 252
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5684,
- "step": 253
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3875,
- "step": 254
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9489,
- "step": 255
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.8948,
- "step": 256
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0856,
- "step": 257
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.599,
- "step": 258
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1575,
- "step": 259
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3701,
- "step": 260
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.464,
- "step": 261
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9193,
- "step": 262
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5679,
- "step": 263
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9424,
- "step": 264
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6689,
- "step": 265
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6475,
- "step": 266
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4311,
- "step": 267
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7426,
- "step": 268
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5191,
- "step": 269
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3059,
- "step": 270
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0142,
- "step": 271
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.4509,
- "step": 272
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0831,
- "step": 273
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6977,
- "step": 274
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4236,
- "step": 275
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2129,
- "step": 276
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1394,
- "step": 277
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.685,
- "step": 278
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0275,
- "step": 279
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.3215,
- "step": 280
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6542,
- "step": 281
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7614,
- "step": 282
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2996,
- "step": 283
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6275,
- "step": 284
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8736,
- "step": 285
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4667,
- "step": 286
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8486,
- "step": 287
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2125,
- "step": 288
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4523,
- "step": 289
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.551,
- "step": 290
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.7158,
- "step": 291
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5092,
- "step": 292
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9169,
- "step": 293
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5333,
- "step": 294
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9949,
- "step": 295
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.7189,
- "step": 296
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2366,
- "step": 297
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4745,
- "step": 298
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2439,
- "step": 299
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4176,
- "step": 300
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.9365,
- "step": 301
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5309,
- "step": 302
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2201,
- "step": 303
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0312,
- "step": 304
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4173,
- "step": 305
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4856,
- "step": 306
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5041,
- "step": 307
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3597,
- "step": 308
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8395,
- "step": 309
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0776,
- "step": 310
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7566,
- "step": 311
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9767,
- "step": 312
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.3804,
- "step": 313
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5327,
- "step": 314
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5293,
- "step": 315
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4531,
- "step": 316
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3961,
- "step": 317
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5669,
- "step": 318
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8559,
- "step": 319
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.117,
- "step": 320
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4279,
- "step": 321
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7977,
- "step": 322
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.955,
- "step": 323
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0164,
- "step": 324
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 12.0495,
- "step": 325
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2768,
- "step": 326
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3162,
- "step": 327
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.88,
- "step": 328
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2157,
- "step": 329
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8427,
- "step": 330
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9729,
- "step": 331
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1779,
- "step": 332
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1302,
- "step": 333
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7705,
- "step": 334
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.523,
- "step": 335
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9375,
- "step": 336
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.1409,
- "step": 337
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.633,
- "step": 338
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6481,
- "step": 339
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.933,
- "step": 340
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9179,
- "step": 341
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9332,
- "step": 342
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6553,
- "step": 343
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7412,
- "step": 344
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.849,
- "step": 345
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.7321,
- "step": 346
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9717,
- "step": 347
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3465,
- "step": 348
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4535,
- "step": 349
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2376,
- "step": 350
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9025,
- "step": 351
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.916,
- "step": 352
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.3785,
- "step": 353
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0576,
- "step": 354
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5081,
- "step": 355
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1303,
- "step": 356
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3854,
- "step": 357
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 11.5553,
- "step": 358
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9627,
- "step": 359
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.402,
- "step": 360
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3484,
- "step": 361
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5428,
- "step": 362
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9128,
- "step": 363
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3934,
- "step": 364
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4812,
- "step": 365
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5395,
- "step": 366
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6304,
- "step": 367
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5626,
- "step": 368
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5693,
- "step": 369
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3458,
- "step": 370
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6254,
- "step": 371
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8706,
- "step": 372
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6076,
- "step": 373
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.2912,
- "step": 374
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3326,
- "step": 375
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3735,
- "step": 376
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4916,
- "step": 377
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5553,
- "step": 378
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6241,
- "step": 379
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6106,
- "step": 380
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.266,
- "step": 381
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7738,
- "step": 382
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4988,
- "step": 383
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2968,
- "step": 384
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8512,
- "step": 385
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0341,
- "step": 386
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.898,
- "step": 387
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.23,
- "step": 388
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9608,
- "step": 389
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.3679,
- "step": 390
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.7074,
- "step": 391
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9903,
- "step": 392
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5845,
- "step": 393
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6493,
- "step": 394
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7962,
- "step": 395
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4865,
- "step": 396
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3418,
- "step": 397
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3942,
- "step": 398
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4715,
- "step": 399
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.2073,
- "step": 400
- },
- {
- "epoch": 0.0,
- "eval_loss": 7.106412410736084,
- "eval_runtime": 22.5667,
- "eval_samples_per_second": 2.216,
- "eval_steps_per_second": 1.108,
- "step": 400
- },
- {
- "epoch": 0.0,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 2.9128687667846678,
- "step": 400
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3984,
- "step": 401
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7983,
- "step": 402
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.8589,
- "step": 403
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9884,
- "step": 404
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.4427,
- "step": 405
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0374,
- "step": 406
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7999,
- "step": 407
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2437,
- "step": 408
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6902,
- "step": 409
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.81,
- "step": 410
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8979,
- "step": 411
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0211,
- "step": 412
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3945,
- "step": 413
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.5807,
- "step": 414
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1433,
- "step": 415
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9466,
- "step": 416
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6276,
- "step": 417
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4945,
- "step": 418
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.6215,
- "step": 419
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.3919,
- "step": 420
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7915,
- "step": 421
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3284,
- "step": 422
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8723,
- "step": 423
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0149,
- "step": 424
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.979,
- "step": 425
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9175,
- "step": 426
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4994,
- "step": 427
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.9791,
- "step": 428
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1156,
- "step": 429
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5813,
- "step": 430
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1882,
- "step": 431
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9956,
- "step": 432
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6189,
- "step": 433
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9624,
- "step": 434
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5387,
- "step": 435
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4605,
- "step": 436
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.474,
- "step": 437
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0497,
- "step": 438
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5705,
- "step": 439
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.275,
- "step": 440
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9638,
- "step": 441
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4857,
- "step": 442
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3067,
- "step": 443
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8152,
- "step": 444
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1668,
- "step": 445
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5293,
- "step": 446
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3981,
- "step": 447
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4787,
- "step": 448
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5981,
- "step": 449
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3569,
- "step": 450
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4088,
- "step": 451
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3677,
- "step": 452
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.4686,
- "step": 453
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3552,
- "step": 454
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7931,
- "step": 455
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.9285,
- "step": 456
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0554,
- "step": 457
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7277,
- "step": 458
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2474,
- "step": 459
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9274,
- "step": 460
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2558,
- "step": 461
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.7547,
- "step": 462
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1264,
- "step": 463
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2124,
- "step": 464
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8751,
- "step": 465
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7317,
- "step": 466
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3697,
- "step": 467
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0021,
- "step": 468
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3761,
- "step": 469
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2291,
- "step": 470
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7968,
- "step": 471
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9454,
- "step": 472
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0194,
- "step": 473
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5048,
- "step": 474
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6837,
- "step": 475
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1066,
- "step": 476
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3501,
- "step": 477
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5071,
- "step": 478
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1086,
- "step": 479
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7269,
- "step": 480
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5419,
- "step": 481
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2974,
- "step": 482
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.1433,
- "step": 483
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0869,
- "step": 484
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.032,
- "step": 485
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0946,
- "step": 486
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7162,
- "step": 487
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.0406,
- "step": 488
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.9048,
- "step": 489
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2231,
- "step": 490
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.6524,
- "step": 491
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1151,
- "step": 492
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.591,
- "step": 493
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.1628,
- "step": 494
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0757,
- "step": 495
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.3471,
- "step": 496
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9385,
- "step": 497
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9362,
- "step": 498
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2252,
- "step": 499
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.359,
- "step": 500
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0497,
- "step": 501
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0484,
- "step": 502
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5773,
- "step": 503
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.39,
- "step": 504
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5923,
- "step": 505
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2,
- "step": 506
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5536,
- "step": 507
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.8958,
- "step": 508
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7763,
- "step": 509
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2045,
- "step": 510
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.4219,
- "step": 511
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6305,
- "step": 512
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.4243,
- "step": 513
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7842,
- "step": 514
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8769,
- "step": 515
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8903,
- "step": 516
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0489,
- "step": 517
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1314,
- "step": 518
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.5973,
- "step": 519
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.8022,
- "step": 520
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3539,
- "step": 521
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.222,
- "step": 522
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5403,
- "step": 523
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.1323,
- "step": 524
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7813,
- "step": 525
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4982,
- "step": 526
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2426,
- "step": 527
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0142,
- "step": 528
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8996,
- "step": 529
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.8671,
- "step": 530
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4139,
- "step": 531
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9478,
- "step": 532
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7062,
- "step": 533
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.0098,
- "step": 534
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9195,
- "step": 535
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0255,
- "step": 536
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.6291,
- "step": 537
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3245,
- "step": 538
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6382,
- "step": 539
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 1.8076,
- "step": 540
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6725,
- "step": 541
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0563,
- "step": 542
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.6178,
- "step": 543
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7974,
- "step": 544
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.7535,
- "step": 545
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4948,
- "step": 546
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8941,
- "step": 547
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.6496,
- "step": 548
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.9084,
- "step": 549
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.65,
- "step": 550
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7673,
- "step": 551
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 10.2221,
- "step": 552
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.14,
- "step": 553
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.6747,
- "step": 554
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8009,
- "step": 555
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7307,
- "step": 556
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0143,
- "step": 557
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.8098,
- "step": 558
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.026,
- "step": 559
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4572,
- "step": 560
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7913,
- "step": 561
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9962,
- "step": 562
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.767,
- "step": 563
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.9497,
- "step": 564
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.9626,
- "step": 565
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2536,
- "step": 566
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0421,
- "step": 567
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.8177,
- "step": 568
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9241,
- "step": 569
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0162,
- "step": 570
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.3368,
- "step": 571
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7515,
- "step": 572
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.6389,
- "step": 573
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.662,
- "step": 574
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8097,
- "step": 575
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9346,
- "step": 576
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.3154,
- "step": 577
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.7724,
- "step": 578
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3685,
- "step": 579
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.2775,
- "step": 580
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.106,
- "step": 581
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4733,
- "step": 582
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2334,
- "step": 583
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9478,
- "step": 584
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0013,
- "step": 585
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.7242,
- "step": 586
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.922,
- "step": 587
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.1418,
- "step": 588
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4472,
- "step": 589
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.4785,
- "step": 590
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.783,
- "step": 591
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.0706,
- "step": 592
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4136,
- "step": 593
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.5969,
- "step": 594
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5157,
- "step": 595
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.5658,
- "step": 596
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.4647,
- "step": 597
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.2028,
- "step": 598
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.6913,
- "step": 599
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7239,
- "step": 600
- },
- {
- "epoch": 0.0,
- "eval_loss": 7.012163162231445,
- "eval_runtime": 22.5807,
- "eval_samples_per_second": 2.214,
- "eval_steps_per_second": 1.107,
- "step": 600
- },
- {
- "epoch": 0.0,
- "mmlu_eval_accuracy": 0.3260281385281385,
- "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
- "mmlu_eval_accuracy_anatomy": 0.35714285714285715,
- "mmlu_eval_accuracy_astronomy": 0.25,
- "mmlu_eval_accuracy_business_ethics": 0.3333333333333333,
- "mmlu_loss": 4.24488224029541,
- "step": 600
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.5253,
- "step": 601
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0392,
- "step": 602
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.447,
- "step": 603
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.9441,
- "step": 604
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.1874,
- "step": 605
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.7817,
- "step": 606
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0348,
- "step": 607
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.5593,
- "step": 608
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.9361,
- "step": 609
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3534,
- "step": 610
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.476,
- "step": 611
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0937,
- "step": 612
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3027,
- "step": 613
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.5586,
- "step": 614
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3796,
- "step": 615
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.676,
- "step": 616
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.5321,
- "step": 617
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.0059,
- "step": 618
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.6139,
- "step": 619
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.2391,
- "step": 620
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.0636,
- "step": 621
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0895,
- "step": 622
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.62,
- "step": 623
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.0469,
- "step": 624
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.2173,
- "step": 625
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 3.9432,
- "step": 626
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.3928,
- "step": 627
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0959,
- "step": 628
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.1197,
- "step": 629
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 2.4277,
- "step": 630
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.418,
- "step": 631
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8687,
- "step": 632
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.0156,
- "step": 633
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.573,
- "step": 634
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.112,
- "step": 635
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8954,
- "step": 636
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.36,
- "step": 637
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.924,
- "step": 638
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.4625,
- "step": 639
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.2023,
- "step": 640
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.0685,
- "step": 641
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.5304,
- "step": 642
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4456,
- "step": 643
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.7271,
- "step": 644
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.6011,
- "step": 645
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.895,
- "step": 646
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 4.864,
- "step": 647
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.3452,
- "step": 648
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 5.8978,
- "step": 649
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.2253,
- "step": 650
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 9.2813,
- "step": 651
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 7.7248,
- "step": 652
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 8.4283,
- "step": 653
- },
- {
- "epoch": 0.0,
- "learning_rate": 0.0004,
- "loss": 6.4304,
- "step": 654
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3893,
- "step": 655
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1115,
- "step": 656
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.5892,
- "step": 657
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6572,
- "step": 658
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.925,
- "step": 659
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4431,
- "step": 660
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7711,
- "step": 661
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9439,
- "step": 662
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3781,
- "step": 663
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5573,
- "step": 664
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.4476,
- "step": 665
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0057,
- "step": 666
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2702,
- "step": 667
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5717,
- "step": 668
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2242,
- "step": 669
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1,
- "step": 670
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0517,
- "step": 671
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.6543,
- "step": 672
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1138,
- "step": 673
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.461,
- "step": 674
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.7094,
- "step": 675
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.521,
- "step": 676
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7116,
- "step": 677
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6343,
- "step": 678
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3762,
- "step": 679
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3603,
- "step": 680
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7144,
- "step": 681
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4545,
- "step": 682
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8188,
- "step": 683
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7965,
- "step": 684
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.4675,
- "step": 685
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0436,
- "step": 686
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1219,
- "step": 687
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4517,
- "step": 688
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8476,
- "step": 689
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 1.9284,
- "step": 690
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.7405,
- "step": 691
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7142,
- "step": 692
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3979,
- "step": 693
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 1.3285,
- "step": 694
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.3418,
- "step": 695
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4472,
- "step": 696
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7355,
- "step": 697
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7982,
- "step": 698
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.4516,
- "step": 699
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.2532,
- "step": 700
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.9959,
- "step": 701
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0418,
- "step": 702
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.7767,
- "step": 703
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.774,
- "step": 704
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8912,
- "step": 705
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2178,
- "step": 706
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.6197,
- "step": 707
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4755,
- "step": 708
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8276,
- "step": 709
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.2925,
- "step": 710
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.3887,
- "step": 711
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.1465,
- "step": 712
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.5806,
- "step": 713
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.3063,
- "step": 714
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6066,
- "step": 715
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.1536,
- "step": 716
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5582,
- "step": 717
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.0353,
- "step": 718
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.6415,
- "step": 719
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8291,
- "step": 720
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 10.7575,
- "step": 721
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9141,
- "step": 722
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.5217,
- "step": 723
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4549,
- "step": 724
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.8112,
- "step": 725
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2729,
- "step": 726
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8515,
- "step": 727
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.9712,
- "step": 728
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.097,
- "step": 729
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0208,
- "step": 730
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1377,
- "step": 731
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.4019,
- "step": 732
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.9869,
- "step": 733
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2954,
- "step": 734
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4144,
- "step": 735
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.8053,
- "step": 736
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.8891,
- "step": 737
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.812,
- "step": 738
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.2657,
- "step": 739
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3747,
- "step": 740
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.0364,
- "step": 741
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.8845,
- "step": 742
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.887,
- "step": 743
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.0706,
- "step": 744
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6619,
- "step": 745
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.2941,
- "step": 746
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9192,
- "step": 747
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.9947,
- "step": 748
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6376,
- "step": 749
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.0358,
- "step": 750
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4578,
- "step": 751
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.7784,
- "step": 752
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.632,
- "step": 753
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.8649,
- "step": 754
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7951,
- "step": 755
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.3841,
- "step": 756
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.4558,
- "step": 757
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.7638,
- "step": 758
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.9413,
- "step": 759
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.0916,
- "step": 760
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1351,
- "step": 761
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6078,
- "step": 762
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7982,
- "step": 763
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6132,
- "step": 764
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.551,
- "step": 765
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3301,
- "step": 766
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4888,
- "step": 767
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1476,
- "step": 768
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4244,
- "step": 769
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.6025,
- "step": 770
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.102,
- "step": 771
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.017,
- "step": 772
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4101,
- "step": 773
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1741,
- "step": 774
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.1256,
- "step": 775
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.5164,
- "step": 776
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 2.6959,
- "step": 777
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.7666,
- "step": 778
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.4336,
- "step": 779
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 11.8478,
- "step": 780
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 6.8382,
- "step": 781
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 8.1792,
- "step": 782
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.4424,
- "step": 783
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.345,
- "step": 784
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.6887,
- "step": 785
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.9867,
- "step": 786
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.6152,
- "step": 787
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.7283,
- "step": 788
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.0157,
- "step": 789
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.6044,
- "step": 790
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.4132,
- "step": 791
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.735,
- "step": 792
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.3631,
- "step": 793
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 3.2308,
- "step": 794
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.2184,
- "step": 795
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 5.4661,
- "step": 796
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 4.9637,
- "step": 797
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 7.4178,
- "step": 798
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.5909,
- "step": 799
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0004,
- "loss": 9.1482,
- "step": 800
- },
- {
- "epoch": 0.01,
- "eval_loss": 7.355834484100342,
- "eval_runtime": 22.6252,
- "eval_samples_per_second": 2.21,
- "eval_steps_per_second": 1.105,
- "step": 800
- },
- {
- "epoch": 0.01,
- "mmlu_eval_accuracy": 0.2525477994227994,
- "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
- "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
- "mmlu_eval_accuracy_astronomy": 0.3125,
- "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
- "mmlu_loss": 5.191131496429444,
- "step": 800
- }
- ],
- "max_steps": 30000,
- "num_train_epochs": 1,
- "total_flos": 1.313267421609984e+16,
- "trial_name": null,
- "trial_params": null
-}
diff --git a/checkpoint-800/training_args.bin b/checkpoint-800/training_args.bin
deleted file mode 100644
index 29a1b90871dc30211978426049e89f31e2b38f56..0000000000000000000000000000000000000000
--- a/checkpoint-800/training_args.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2493c95326c359fb00f88976162bc7966690beaaca22964b91c1db649a04988f
-size 6011