Commit f8b60c0 · 1 Parent(s): 0eb810b
Farouk committed

Training in progress, step 6200

adapter_config.json CHANGED
@@ -14,13 +14,13 @@
  "r": 64,
  "revision": null,
  "target_modules": [
- "o_proj",
- "k_proj",
- "down_proj",
  "gate_proj",
  "up_proj",
  "v_proj",
- "q_proj"
  ],
  "task_type": "CAUSAL_LM"
  }

  "r": 64,
  "revision": null,
  "target_modules": [
  "gate_proj",
+ "down_proj",
+ "q_proj",
  "up_proj",
  "v_proj",
+ "k_proj",
+ "o_proj"
  ],
  "task_type": "CAUSAL_LM"
  }
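For orientation: the hunk above only reorders the `target_modules` list; the set of projections (q/k/v/o plus gate/up/down) is identical before and after. Below is a minimal sketch of the equivalent `peft` LoraConfig, with the remaining hyperparameters taken from the full adapter_config.json added under checkpoint-6200 further down; this is illustrative and not part of the commit.

```python
# Sketch only: the LoRA configuration that adapter_config.json above serializes.
from peft import LoraConfig

lora_config = LoraConfig(
    r=64,                   # "r": 64
    lora_alpha=16,          # "lora_alpha": 16.0
    lora_dropout=0.1,       # "lora_dropout": 0.1
    bias="none",            # "bias": "none"
    task_type="CAUSAL_LM",  # "task_type": "CAUSAL_LM"
    target_modules=[        # module ordering as of this commit
        "gate_proj", "down_proj", "q_proj",
        "up_proj", "v_proj", "k_proj", "o_proj",
    ],
)
```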
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fcb8e0e5b9b35a1744defdf16c1328ef8088022cb7d068af80b5615e4bcbbb88
  size 871609293

  version https://git-lfs.github.com/spec/v1
+ oid sha256:89e3e86e88a2a473616d28f379b3735697c068cbf1c5d7c8fe7b56148a37a0af
  size 871609293
all_results.json CHANGED
@@ -1,11 +1,11 @@
  {
- "epoch": 0.04,
  "eval_loss": 6.335043907165527,
- "eval_runtime": 21.6378,
- "eval_samples_per_second": 2.311,
- "eval_steps_per_second": 1.155,
- "train_loss": 0.6445872698006807,
- "train_runtime": 1748.3273,
- "train_samples_per_second": 17.159,
- "train_steps_per_second": 17.159
  }

  {
+ "epoch": 0.05,
  "eval_loss": 6.335043907165527,
+ "eval_runtime": 21.7341,
+ "eval_samples_per_second": 2.301,
+ "eval_steps_per_second": 1.15,
+ "train_loss": 0.5874443841576577,
+ "train_runtime": 1725.6374,
+ "train_samples_per_second": 17.385,
+ "train_steps_per_second": 17.385
  }
checkpoint-4200/adapter_model/adapter_model/README.md CHANGED
@@ -114,6 +114,28 @@ The following `bitsandbytes` quantization config was used during training:
  - bnb_4bit_use_double_quant: True
  - bnb_4bit_compute_dtype: bfloat16

  The following `bitsandbytes` quantization config was used during training:
  - load_in_8bit: False
  - load_in_4bit: True
@@ -136,5 +158,7 @@ The following `bitsandbytes` quantization config was used during training:
  - PEFT 0.4.0
  - PEFT 0.4.0
  - PEFT 0.4.0

  - PEFT 0.4.0

  - bnb_4bit_use_double_quant: True
  - bnb_4bit_compute_dtype: bfloat16

+ The following `bitsandbytes` quantization config was used during training:
+ - load_in_8bit: False
+ - load_in_4bit: True
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: nf4
+ - bnb_4bit_use_double_quant: True
+ - bnb_4bit_compute_dtype: bfloat16
+
+ The following `bitsandbytes` quantization config was used during training:
+ - load_in_8bit: False
+ - load_in_4bit: True
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: nf4
+ - bnb_4bit_use_double_quant: True
+ - bnb_4bit_compute_dtype: bfloat16
+
  The following `bitsandbytes` quantization config was used during training:
  - load_in_8bit: False
  - load_in_4bit: True

  - PEFT 0.4.0
  - PEFT 0.4.0
  - PEFT 0.4.0
+ - PEFT 0.4.0
+ - PEFT 0.4.0

  - PEFT 0.4.0
checkpoint-4200/adapter_model/adapter_model/adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4e479a2816f8579f7d795f26c6e76d4c6617567fcbef52ee3f437547a461fa1c
  size 871609293

  version https://git-lfs.github.com/spec/v1
+ oid sha256:ff18c40f9b3c9fb20f1c95d4dff151244eba09eee79ae11c6121cc23181c2442
  size 871609293
checkpoint-6200/README.md ADDED
@@ -0,0 +1,20 @@
+ ---
+ library_name: peft
+ ---
+ ## Training procedure
+
+
+ The following `bitsandbytes` quantization config was used during training:
+ - load_in_8bit: False
+ - load_in_4bit: True
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: nf4
+ - bnb_4bit_use_double_quant: True
+ - bnb_4bit_compute_dtype: bfloat16
+ ### Framework versions
+
+
+ - PEFT 0.4.0
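The quantization settings listed in this README correspond to a `transformers` BitsAndBytesConfig. A minimal sketch with the values copied from the list above; this is illustrative only, since the training script itself is not part of this commit.

```python
# Sketch only: the 4-bit bitsandbytes config described in the README above.
import torch
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                       # load_in_4bit: True
    bnb_4bit_quant_type="nf4",               # bnb_4bit_quant_type: nf4
    bnb_4bit_use_double_quant=True,          # bnb_4bit_use_double_quant: True
    bnb_4bit_compute_dtype=torch.bfloat16,   # bnb_4bit_compute_dtype: bfloat16
    llm_int8_threshold=6.0,                  # llm_int8_threshold: 6.0
    llm_int8_skip_modules=None,              # llm_int8_skip_modules: None
    llm_int8_enable_fp32_cpu_offload=False,  # llm_int8_enable_fp32_cpu_offload: False
    llm_int8_has_fp16_weight=False,          # llm_int8_has_fp16_weight: False
)
```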
checkpoint-6200/adapter_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "auto_mapping": null,
+ "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16.0,
+ "lora_dropout": 0.1,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 64,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "down_proj",
+ "q_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+ }
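As a rough illustration of how this checkpoint directory is consumed, the sketch below loads the 4-bit base model named in `base_model_name_or_path` and attaches the LoRA weights from checkpoint-6200. The local path and the reuse of `bnb_config` from the previous sketch are assumptions, not something this commit prescribes.

```python
# Sketch only: attach the checkpoint-6200 LoRA adapter to the quantized base model.
from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained(
    "codellama/CodeLlama-34b-Python-hf",  # base_model_name_or_path in adapter_config.json
    quantization_config=bnb_config,       # BitsAndBytesConfig sketched above
    device_map="auto",
)
model = PeftModel.from_pretrained(base, "./checkpoint-6200")  # hypothetical local path
```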
checkpoint-6200/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89e3e86e88a2a473616d28f379b3735697c068cbf1c5d7c8fe7b56148a37a0af
+ size 871609293
checkpoint-6200/added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+ "[PAD]": 32000
+ }
checkpoint-6200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6f915a3be0537ed39b74e6b133c9652eb8040c1f61e027bd380f53fc1de4740e
+ size 873872799
checkpoint-6200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d56947d85c4236c819c7e5ba1f3020a0c401b4caa051580cac172d0a50c72119
+ size 14511
checkpoint-6200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:81248501833af563175f43c1d681185643b8411cee1fb1e631b8687c465eb2e3
+ size 627
checkpoint-6200/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "bos_token": "<s>",
+ "eos_token": "</s>",
+ "pad_token": "[PAD]",
+ "unk_token": "<unk>"
+ }
checkpoint-6200/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
checkpoint-6200/tokenizer_config.json ADDED
@@ -0,0 +1,35 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "bos_token": {
+ "__type": "AddedToken",
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "clean_up_tokenization_spaces": false,
+ "eos_token": {
+ "__type": "AddedToken",
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "legacy": null,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": null,
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": {
+ "__type": "AddedToken",
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
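Taken together, added_tokens.json, special_tokens_map.json, and this tokenizer_config.json describe a LlamaTokenizer whose pad token is the extra `[PAD]` token at id 32000. A minimal sketch of inspecting that wiring; the local path is hypothetical.

```python
# Sketch only: load the checkpoint tokenizer and check the [PAD] token wiring.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./checkpoint-6200")  # hypothetical local path
print(tok.pad_token)                       # "[PAD]"  (special_tokens_map.json)
print(tok.convert_tokens_to_ids("[PAD]"))  # 32000    (added_tokens.json)
print(tok.padding_side)                    # "right"  (tokenizer_config.json)
```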
checkpoint-6200/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-6200/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:85783faab59f5f6d8bcf691e35bb86cff435e22f3fa9169bf4e56c0239c8d7e4
+ size 6011
eval_results.json CHANGED
@@ -1,7 +1,7 @@
  {
- "epoch": 0.04,
  "eval_loss": 6.335043907165527,
- "eval_runtime": 21.6378,
- "eval_samples_per_second": 2.311,
- "eval_steps_per_second": 1.155
  }

  {
+ "epoch": 0.05,
  "eval_loss": 6.335043907165527,
+ "eval_runtime": 21.7341,
+ "eval_samples_per_second": 2.301,
+ "eval_steps_per_second": 1.15
  }
metrics.json CHANGED
@@ -1 +1 @@
- {"run_name": "codellama34b_unnatural", "train_runtime": 1748.3273, "train_samples_per_second": 17.159, "train_steps_per_second": 17.159, "train_loss": 0.6445872698006807, "epoch": 0.04, "eval_loss": 6.335043907165527, "eval_runtime": 21.6378, "eval_samples_per_second": 2.311, "eval_steps_per_second": 1.155}
+ {"run_name": "codellama34b_unnatural", "train_runtime": 1725.6374, "train_samples_per_second": 17.385, "train_steps_per_second": 17.385, "train_loss": 0.5874443841576577, "epoch": 0.05, "eval_loss": 6.335043907165527, "eval_runtime": 21.7341, "eval_samples_per_second": 2.301, "eval_steps_per_second": 1.15}
train_results.json CHANGED
@@ -1,7 +1,7 @@
  {
- "epoch": 0.04,
- "train_loss": 0.6445872698006807,
- "train_runtime": 1748.3273,
- "train_samples_per_second": 17.159,
- "train_steps_per_second": 17.159
  }

  {
+ "epoch": 0.05,
+ "train_loss": 0.5874443841576577,
+ "train_runtime": 1725.6374,
+ "train_samples_per_second": 17.385,
+ "train_steps_per_second": 17.385
  }
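train_results.json, eval_results.json, and all_results.json follow the layout produced by `transformers.Trainer.save_metrics`, while metrics.json (with `run_name`) looks script-specific. A minimal sketch of the usual call pattern, stated as an assumption since the training script is not part of this commit.

```python
# Sketch only: the Trainer calls that typically emit these metric files.
# Assumes `trainer` is an already-constructed transformers.Trainer.
train_result = trainer.train()
trainer.save_metrics("train", train_result.metrics)  # -> train_results.json (+ all_results.json)
eval_metrics = trainer.evaluate()
trainer.save_metrics("eval", eval_metrics)           # -> eval_results.json (+ all_results.json)
```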
trainer_state.json CHANGED
@@ -1,8 +1,8 @@
  {
  "best_metric": 6.335043907165527,
  "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-4200",
- "epoch": 0.04124971354365595,
- "global_step": 5400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -32928,11 +32928,3674 @@
  "train_runtime": 1748.3273,
  "train_samples_per_second": 17.159,
  "train_steps_per_second": 17.159
  }
  ],
  "max_steps": 30000,
  "num_train_epochs": 1,
- "total_flos": 8.918950910784307e+16,
  "trial_name": null,
  "trial_params": null
  }

  {
  "best_metric": 6.335043907165527,
  "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-4200",
+ "epoch": 0.045833015048506606,
+ "global_step": 6000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,

  "train_runtime": 1748.3273,
  "train_samples_per_second": 17.159,
  "train_steps_per_second": 17.159
32931
+ },
32932
+ {
32933
+ "epoch": 0.04,
32934
+ "learning_rate": 0.0004,
32935
+ "loss": 8.7161,
32936
+ "step": 5401
32937
+ },
32938
+ {
32939
+ "epoch": 0.04,
32940
+ "learning_rate": 0.0004,
32941
+ "loss": 7.6554,
32942
+ "step": 5402
32943
+ },
32944
+ {
32945
+ "epoch": 0.04,
32946
+ "learning_rate": 0.0004,
32947
+ "loss": 6.7865,
32948
+ "step": 5403
32949
+ },
32950
+ {
32951
+ "epoch": 0.04,
32952
+ "learning_rate": 0.0004,
32953
+ "loss": 8.4439,
32954
+ "step": 5404
32955
+ },
32956
+ {
32957
+ "epoch": 0.04,
32958
+ "learning_rate": 0.0004,
32959
+ "loss": 7.0393,
32960
+ "step": 5405
32961
+ },
32962
+ {
32963
+ "epoch": 0.04,
32964
+ "learning_rate": 0.0004,
32965
+ "loss": 7.3437,
32966
+ "step": 5406
32967
+ },
32968
+ {
32969
+ "epoch": 0.04,
32970
+ "learning_rate": 0.0004,
32971
+ "loss": 8.6367,
32972
+ "step": 5407
32973
+ },
32974
+ {
32975
+ "epoch": 0.04,
32976
+ "learning_rate": 0.0004,
32977
+ "loss": 7.3527,
32978
+ "step": 5408
32979
+ },
32980
+ {
32981
+ "epoch": 0.04,
32982
+ "learning_rate": 0.0004,
32983
+ "loss": 7.4897,
32984
+ "step": 5409
32985
+ },
32986
+ {
32987
+ "epoch": 0.04,
32988
+ "learning_rate": 0.0004,
32989
+ "loss": 6.9515,
32990
+ "step": 5410
32991
+ },
32992
+ {
32993
+ "epoch": 0.04,
32994
+ "learning_rate": 0.0004,
32995
+ "loss": 6.6737,
32996
+ "step": 5411
32997
+ },
32998
+ {
32999
+ "epoch": 0.04,
33000
+ "learning_rate": 0.0004,
33001
+ "loss": 4.9802,
33002
+ "step": 5412
33003
+ },
33004
+ {
33005
+ "epoch": 0.04,
33006
+ "learning_rate": 0.0004,
33007
+ "loss": 6.9681,
33008
+ "step": 5413
33009
+ },
33010
+ {
33011
+ "epoch": 0.04,
33012
+ "learning_rate": 0.0004,
33013
+ "loss": 5.0998,
33014
+ "step": 5414
33015
+ },
33016
+ {
33017
+ "epoch": 0.04,
33018
+ "learning_rate": 0.0004,
33019
+ "loss": 5.0552,
33020
+ "step": 5415
33021
+ },
33022
+ {
33023
+ "epoch": 0.04,
33024
+ "learning_rate": 0.0004,
33025
+ "loss": 3.956,
33026
+ "step": 5416
33027
+ },
33028
+ {
33029
+ "epoch": 0.04,
33030
+ "learning_rate": 0.0004,
33031
+ "loss": 7.4181,
33032
+ "step": 5417
33033
+ },
33034
+ {
33035
+ "epoch": 0.04,
33036
+ "learning_rate": 0.0004,
33037
+ "loss": 4.7352,
33038
+ "step": 5418
33039
+ },
33040
+ {
33041
+ "epoch": 0.04,
33042
+ "learning_rate": 0.0004,
33043
+ "loss": 6.7415,
33044
+ "step": 5419
33045
+ },
33046
+ {
33047
+ "epoch": 0.04,
33048
+ "learning_rate": 0.0004,
33049
+ "loss": 4.289,
33050
+ "step": 5420
33051
+ },
33052
+ {
33053
+ "epoch": 0.04,
33054
+ "learning_rate": 0.0004,
33055
+ "loss": 7.1359,
33056
+ "step": 5421
33057
+ },
33058
+ {
33059
+ "epoch": 0.04,
33060
+ "learning_rate": 0.0004,
33061
+ "loss": 3.1149,
33062
+ "step": 5422
33063
+ },
33064
+ {
33065
+ "epoch": 0.04,
33066
+ "learning_rate": 0.0004,
33067
+ "loss": 8.371,
33068
+ "step": 5423
33069
+ },
33070
+ {
33071
+ "epoch": 0.04,
33072
+ "learning_rate": 0.0004,
33073
+ "loss": 2.7619,
33074
+ "step": 5424
33075
+ },
33076
+ {
33077
+ "epoch": 0.04,
33078
+ "learning_rate": 0.0004,
33079
+ "loss": 5.3229,
33080
+ "step": 5425
33081
+ },
33082
+ {
33083
+ "epoch": 0.04,
33084
+ "learning_rate": 0.0004,
33085
+ "loss": 2.5643,
33086
+ "step": 5426
33087
+ },
33088
+ {
33089
+ "epoch": 0.04,
33090
+ "learning_rate": 0.0004,
33091
+ "loss": 6.8752,
33092
+ "step": 5427
33093
+ },
33094
+ {
33095
+ "epoch": 0.04,
33096
+ "learning_rate": 0.0004,
33097
+ "loss": 3.8785,
33098
+ "step": 5428
33099
+ },
33100
+ {
33101
+ "epoch": 0.04,
33102
+ "learning_rate": 0.0004,
33103
+ "loss": 5.06,
33104
+ "step": 5429
33105
+ },
33106
+ {
33107
+ "epoch": 0.04,
33108
+ "learning_rate": 0.0004,
33109
+ "loss": 6.7646,
33110
+ "step": 5430
33111
+ },
33112
+ {
33113
+ "epoch": 0.04,
33114
+ "learning_rate": 0.0004,
33115
+ "loss": 3.774,
33116
+ "step": 5431
33117
+ },
33118
+ {
33119
+ "epoch": 0.04,
33120
+ "learning_rate": 0.0004,
33121
+ "loss": 2.777,
33122
+ "step": 5432
33123
+ },
33124
+ {
33125
+ "epoch": 0.04,
33126
+ "learning_rate": 0.0004,
33127
+ "loss": 7.2203,
33128
+ "step": 5433
33129
+ },
33130
+ {
33131
+ "epoch": 0.04,
33132
+ "learning_rate": 0.0004,
33133
+ "loss": 3.7283,
33134
+ "step": 5434
33135
+ },
33136
+ {
33137
+ "epoch": 0.04,
33138
+ "learning_rate": 0.0004,
33139
+ "loss": 7.4205,
33140
+ "step": 5435
33141
+ },
33142
+ {
33143
+ "epoch": 0.04,
33144
+ "learning_rate": 0.0004,
33145
+ "loss": 3.7002,
33146
+ "step": 5436
33147
+ },
33148
+ {
33149
+ "epoch": 0.04,
33150
+ "learning_rate": 0.0004,
33151
+ "loss": 3.32,
33152
+ "step": 5437
33153
+ },
33154
+ {
33155
+ "epoch": 0.04,
33156
+ "learning_rate": 0.0004,
33157
+ "loss": 7.5895,
33158
+ "step": 5438
33159
+ },
33160
+ {
33161
+ "epoch": 0.04,
33162
+ "learning_rate": 0.0004,
33163
+ "loss": 2.5335,
33164
+ "step": 5439
33165
+ },
33166
+ {
33167
+ "epoch": 0.04,
33168
+ "learning_rate": 0.0004,
33169
+ "loss": 2.5173,
33170
+ "step": 5440
33171
+ },
33172
+ {
33173
+ "epoch": 0.04,
33174
+ "learning_rate": 0.0004,
33175
+ "loss": 7.3841,
33176
+ "step": 5441
33177
+ },
33178
+ {
33179
+ "epoch": 0.04,
33180
+ "learning_rate": 0.0004,
33181
+ "loss": 6.1075,
33182
+ "step": 5442
33183
+ },
33184
+ {
33185
+ "epoch": 0.04,
33186
+ "learning_rate": 0.0004,
33187
+ "loss": 8.4645,
33188
+ "step": 5443
33189
+ },
33190
+ {
33191
+ "epoch": 0.04,
33192
+ "learning_rate": 0.0004,
33193
+ "loss": 2.5685,
33194
+ "step": 5444
33195
+ },
33196
+ {
33197
+ "epoch": 0.04,
33198
+ "learning_rate": 0.0004,
33199
+ "loss": 3.2423,
33200
+ "step": 5445
33201
+ },
33202
+ {
33203
+ "epoch": 0.04,
33204
+ "learning_rate": 0.0004,
33205
+ "loss": 8.3062,
33206
+ "step": 5446
33207
+ },
33208
+ {
33209
+ "epoch": 0.04,
33210
+ "learning_rate": 0.0004,
33211
+ "loss": 4.195,
33212
+ "step": 5447
33213
+ },
33214
+ {
33215
+ "epoch": 0.04,
33216
+ "learning_rate": 0.0004,
33217
+ "loss": 4.1215,
33218
+ "step": 5448
33219
+ },
33220
+ {
33221
+ "epoch": 0.04,
33222
+ "learning_rate": 0.0004,
33223
+ "loss": 3.7096,
33224
+ "step": 5449
33225
+ },
33226
+ {
33227
+ "epoch": 0.04,
33228
+ "learning_rate": 0.0004,
33229
+ "loss": 4.8696,
33230
+ "step": 5450
33231
+ },
33232
+ {
33233
+ "epoch": 0.04,
33234
+ "learning_rate": 0.0004,
33235
+ "loss": 3.0856,
33236
+ "step": 5451
33237
+ },
33238
+ {
33239
+ "epoch": 0.04,
33240
+ "learning_rate": 0.0004,
33241
+ "loss": 8.131,
33242
+ "step": 5452
33243
+ },
33244
+ {
33245
+ "epoch": 0.04,
33246
+ "learning_rate": 0.0004,
33247
+ "loss": 6.9663,
33248
+ "step": 5453
33249
+ },
33250
+ {
33251
+ "epoch": 0.04,
33252
+ "learning_rate": 0.0004,
33253
+ "loss": 6.8655,
33254
+ "step": 5454
33255
+ },
33256
+ {
33257
+ "epoch": 0.04,
33258
+ "learning_rate": 0.0004,
33259
+ "loss": 7.4411,
33260
+ "step": 5455
33261
+ },
33262
+ {
33263
+ "epoch": 0.04,
33264
+ "learning_rate": 0.0004,
33265
+ "loss": 7.33,
33266
+ "step": 5456
33267
+ },
33268
+ {
33269
+ "epoch": 0.04,
33270
+ "learning_rate": 0.0004,
33271
+ "loss": 7.4933,
33272
+ "step": 5457
33273
+ },
33274
+ {
33275
+ "epoch": 0.04,
33276
+ "learning_rate": 0.0004,
33277
+ "loss": 3.6501,
33278
+ "step": 5458
33279
+ },
33280
+ {
33281
+ "epoch": 0.04,
33282
+ "learning_rate": 0.0004,
33283
+ "loss": 6.9743,
33284
+ "step": 5459
33285
+ },
33286
+ {
33287
+ "epoch": 0.04,
33288
+ "learning_rate": 0.0004,
33289
+ "loss": 9.029,
33290
+ "step": 5460
33291
+ },
33292
+ {
33293
+ "epoch": 0.04,
33294
+ "learning_rate": 0.0004,
33295
+ "loss": 6.8042,
33296
+ "step": 5461
33297
+ },
33298
+ {
33299
+ "epoch": 0.04,
33300
+ "learning_rate": 0.0004,
33301
+ "loss": 10.3662,
33302
+ "step": 5462
33303
+ },
33304
+ {
33305
+ "epoch": 0.04,
33306
+ "learning_rate": 0.0004,
33307
+ "loss": 6.5808,
33308
+ "step": 5463
33309
+ },
33310
+ {
33311
+ "epoch": 0.04,
33312
+ "learning_rate": 0.0004,
33313
+ "loss": 7.422,
33314
+ "step": 5464
33315
+ },
33316
+ {
33317
+ "epoch": 0.04,
33318
+ "learning_rate": 0.0004,
33319
+ "loss": 7.5076,
33320
+ "step": 5465
33321
+ },
33322
+ {
33323
+ "epoch": 0.04,
33324
+ "learning_rate": 0.0004,
33325
+ "loss": 6.0352,
33326
+ "step": 5466
33327
+ },
33328
+ {
33329
+ "epoch": 0.04,
33330
+ "learning_rate": 0.0004,
33331
+ "loss": 2.7639,
33332
+ "step": 5467
33333
+ },
33334
+ {
33335
+ "epoch": 0.04,
33336
+ "learning_rate": 0.0004,
33337
+ "loss": 4.9244,
33338
+ "step": 5468
33339
+ },
33340
+ {
33341
+ "epoch": 0.04,
33342
+ "learning_rate": 0.0004,
33343
+ "loss": 6.7408,
33344
+ "step": 5469
33345
+ },
33346
+ {
33347
+ "epoch": 0.04,
33348
+ "learning_rate": 0.0004,
33349
+ "loss": 4.7444,
33350
+ "step": 5470
33351
+ },
33352
+ {
33353
+ "epoch": 0.04,
33354
+ "learning_rate": 0.0004,
33355
+ "loss": 8.3459,
33356
+ "step": 5471
33357
+ },
33358
+ {
33359
+ "epoch": 0.04,
33360
+ "learning_rate": 0.0004,
33361
+ "loss": 6.9678,
33362
+ "step": 5472
33363
+ },
33364
+ {
33365
+ "epoch": 0.04,
33366
+ "learning_rate": 0.0004,
33367
+ "loss": 8.1263,
33368
+ "step": 5473
33369
+ },
33370
+ {
33371
+ "epoch": 0.04,
33372
+ "learning_rate": 0.0004,
33373
+ "loss": 6.1176,
33374
+ "step": 5474
33375
+ },
33376
+ {
33377
+ "epoch": 0.04,
33378
+ "learning_rate": 0.0004,
33379
+ "loss": 5.2127,
33380
+ "step": 5475
33381
+ },
33382
+ {
33383
+ "epoch": 0.04,
33384
+ "learning_rate": 0.0004,
33385
+ "loss": 3.1435,
33386
+ "step": 5476
33387
+ },
33388
+ {
33389
+ "epoch": 0.04,
33390
+ "learning_rate": 0.0004,
33391
+ "loss": 5.8836,
33392
+ "step": 5477
33393
+ },
33394
+ {
33395
+ "epoch": 0.04,
33396
+ "learning_rate": 0.0004,
33397
+ "loss": 2.7154,
33398
+ "step": 5478
33399
+ },
33400
+ {
33401
+ "epoch": 0.04,
33402
+ "learning_rate": 0.0004,
33403
+ "loss": 7.6181,
33404
+ "step": 5479
33405
+ },
33406
+ {
33407
+ "epoch": 0.04,
33408
+ "learning_rate": 0.0004,
33409
+ "loss": 3.5132,
33410
+ "step": 5480
33411
+ },
33412
+ {
33413
+ "epoch": 0.04,
33414
+ "learning_rate": 0.0004,
33415
+ "loss": 5.9472,
33416
+ "step": 5481
33417
+ },
33418
+ {
33419
+ "epoch": 0.04,
33420
+ "learning_rate": 0.0004,
33421
+ "loss": 2.7316,
33422
+ "step": 5482
33423
+ },
33424
+ {
33425
+ "epoch": 0.04,
33426
+ "learning_rate": 0.0004,
33427
+ "loss": 4.4713,
33428
+ "step": 5483
33429
+ },
33430
+ {
33431
+ "epoch": 0.04,
33432
+ "learning_rate": 0.0004,
33433
+ "loss": 8.4081,
33434
+ "step": 5484
33435
+ },
33436
+ {
33437
+ "epoch": 0.04,
33438
+ "learning_rate": 0.0004,
33439
+ "loss": 2.5906,
33440
+ "step": 5485
33441
+ },
33442
+ {
33443
+ "epoch": 0.04,
33444
+ "learning_rate": 0.0004,
33445
+ "loss": 7.8309,
33446
+ "step": 5486
33447
+ },
33448
+ {
33449
+ "epoch": 0.04,
33450
+ "learning_rate": 0.0004,
33451
+ "loss": 2.5541,
33452
+ "step": 5487
33453
+ },
33454
+ {
33455
+ "epoch": 0.04,
33456
+ "learning_rate": 0.0004,
33457
+ "loss": 2.6686,
33458
+ "step": 5488
33459
+ },
33460
+ {
33461
+ "epoch": 0.04,
33462
+ "learning_rate": 0.0004,
33463
+ "loss": 2.5044,
33464
+ "step": 5489
33465
+ },
33466
+ {
33467
+ "epoch": 0.04,
33468
+ "learning_rate": 0.0004,
33469
+ "loss": 6.8598,
33470
+ "step": 5490
33471
+ },
33472
+ {
33473
+ "epoch": 0.04,
33474
+ "learning_rate": 0.0004,
33475
+ "loss": 8.1069,
33476
+ "step": 5491
33477
+ },
33478
+ {
33479
+ "epoch": 0.04,
33480
+ "learning_rate": 0.0004,
33481
+ "loss": 9.3975,
33482
+ "step": 5492
33483
+ },
33484
+ {
33485
+ "epoch": 0.04,
33486
+ "learning_rate": 0.0004,
33487
+ "loss": 6.7921,
33488
+ "step": 5493
33489
+ },
33490
+ {
33491
+ "epoch": 0.04,
33492
+ "learning_rate": 0.0004,
33493
+ "loss": 5.8833,
33494
+ "step": 5494
33495
+ },
33496
+ {
33497
+ "epoch": 0.04,
33498
+ "learning_rate": 0.0004,
33499
+ "loss": 5.4129,
33500
+ "step": 5495
33501
+ },
33502
+ {
33503
+ "epoch": 0.04,
33504
+ "learning_rate": 0.0004,
33505
+ "loss": 5.6771,
33506
+ "step": 5496
33507
+ },
33508
+ {
33509
+ "epoch": 0.04,
33510
+ "learning_rate": 0.0004,
33511
+ "loss": 6.3949,
33512
+ "step": 5497
33513
+ },
33514
+ {
33515
+ "epoch": 0.04,
33516
+ "learning_rate": 0.0004,
33517
+ "loss": 7.5032,
33518
+ "step": 5498
33519
+ },
33520
+ {
33521
+ "epoch": 0.04,
33522
+ "learning_rate": 0.0004,
33523
+ "loss": 2.963,
33524
+ "step": 5499
33525
+ },
33526
+ {
33527
+ "epoch": 0.04,
33528
+ "learning_rate": 0.0004,
33529
+ "loss": 3.4149,
33530
+ "step": 5500
33531
+ },
33532
+ {
33533
+ "epoch": 0.04,
33534
+ "learning_rate": 0.0004,
33535
+ "loss": 4.0817,
33536
+ "step": 5501
33537
+ },
33538
+ {
33539
+ "epoch": 0.04,
33540
+ "learning_rate": 0.0004,
33541
+ "loss": 8.606,
33542
+ "step": 5502
33543
+ },
33544
+ {
33545
+ "epoch": 0.04,
33546
+ "learning_rate": 0.0004,
33547
+ "loss": 8.112,
33548
+ "step": 5503
33549
+ },
33550
+ {
33551
+ "epoch": 0.04,
33552
+ "learning_rate": 0.0004,
33553
+ "loss": 9.0323,
33554
+ "step": 5504
33555
+ },
33556
+ {
33557
+ "epoch": 0.04,
33558
+ "learning_rate": 0.0004,
33559
+ "loss": 5.0102,
33560
+ "step": 5505
33561
+ },
33562
+ {
33563
+ "epoch": 0.04,
33564
+ "learning_rate": 0.0004,
33565
+ "loss": 6.9195,
33566
+ "step": 5506
33567
+ },
33568
+ {
33569
+ "epoch": 0.04,
33570
+ "learning_rate": 0.0004,
33571
+ "loss": 8.0544,
33572
+ "step": 5507
33573
+ },
33574
+ {
33575
+ "epoch": 0.04,
33576
+ "learning_rate": 0.0004,
33577
+ "loss": 5.7615,
33578
+ "step": 5508
33579
+ },
33580
+ {
33581
+ "epoch": 0.04,
33582
+ "learning_rate": 0.0004,
33583
+ "loss": 6.9108,
33584
+ "step": 5509
33585
+ },
33586
+ {
33587
+ "epoch": 0.04,
33588
+ "learning_rate": 0.0004,
33589
+ "loss": 7.2846,
33590
+ "step": 5510
33591
+ },
33592
+ {
33593
+ "epoch": 0.04,
33594
+ "learning_rate": 0.0004,
33595
+ "loss": 8.1243,
33596
+ "step": 5511
33597
+ },
33598
+ {
33599
+ "epoch": 0.04,
33600
+ "learning_rate": 0.0004,
33601
+ "loss": 7.0416,
33602
+ "step": 5512
33603
+ },
33604
+ {
33605
+ "epoch": 0.04,
33606
+ "learning_rate": 0.0004,
33607
+ "loss": 5.9334,
33608
+ "step": 5513
33609
+ },
33610
+ {
33611
+ "epoch": 0.04,
33612
+ "learning_rate": 0.0004,
33613
+ "loss": 6.7127,
33614
+ "step": 5514
33615
+ },
33616
+ {
33617
+ "epoch": 0.04,
33618
+ "learning_rate": 0.0004,
33619
+ "loss": 3.2506,
33620
+ "step": 5515
33621
+ },
33622
+ {
33623
+ "epoch": 0.04,
33624
+ "learning_rate": 0.0004,
33625
+ "loss": 9.5912,
33626
+ "step": 5516
33627
+ },
33628
+ {
33629
+ "epoch": 0.04,
33630
+ "learning_rate": 0.0004,
33631
+ "loss": 3.1955,
33632
+ "step": 5517
33633
+ },
33634
+ {
33635
+ "epoch": 0.04,
33636
+ "learning_rate": 0.0004,
33637
+ "loss": 5.3704,
33638
+ "step": 5518
33639
+ },
33640
+ {
33641
+ "epoch": 0.04,
33642
+ "learning_rate": 0.0004,
33643
+ "loss": 5.775,
33644
+ "step": 5519
33645
+ },
33646
+ {
33647
+ "epoch": 0.04,
33648
+ "learning_rate": 0.0004,
33649
+ "loss": 5.4993,
33650
+ "step": 5520
33651
+ },
33652
+ {
33653
+ "epoch": 0.04,
33654
+ "learning_rate": 0.0004,
33655
+ "loss": 8.1517,
33656
+ "step": 5521
33657
+ },
33658
+ {
33659
+ "epoch": 0.04,
33660
+ "learning_rate": 0.0004,
33661
+ "loss": 6.7803,
33662
+ "step": 5522
33663
+ },
33664
+ {
33665
+ "epoch": 0.04,
33666
+ "learning_rate": 0.0004,
33667
+ "loss": 5.2405,
33668
+ "step": 5523
33669
+ },
33670
+ {
33671
+ "epoch": 0.04,
33672
+ "learning_rate": 0.0004,
33673
+ "loss": 3.6089,
33674
+ "step": 5524
33675
+ },
33676
+ {
33677
+ "epoch": 0.04,
33678
+ "learning_rate": 0.0004,
33679
+ "loss": 6.3463,
33680
+ "step": 5525
33681
+ },
33682
+ {
33683
+ "epoch": 0.04,
33684
+ "learning_rate": 0.0004,
33685
+ "loss": 8.8214,
33686
+ "step": 5526
33687
+ },
33688
+ {
33689
+ "epoch": 0.04,
33690
+ "learning_rate": 0.0004,
33691
+ "loss": 7.0789,
33692
+ "step": 5527
33693
+ },
33694
+ {
33695
+ "epoch": 0.04,
33696
+ "learning_rate": 0.0004,
33697
+ "loss": 4.0443,
33698
+ "step": 5528
33699
+ },
33700
+ {
33701
+ "epoch": 0.04,
33702
+ "learning_rate": 0.0004,
33703
+ "loss": 2.9387,
33704
+ "step": 5529
33705
+ },
33706
+ {
33707
+ "epoch": 0.04,
33708
+ "learning_rate": 0.0004,
33709
+ "loss": 3.3787,
33710
+ "step": 5530
33711
+ },
33712
+ {
33713
+ "epoch": 0.04,
33714
+ "learning_rate": 0.0004,
33715
+ "loss": 3.2718,
33716
+ "step": 5531
33717
+ },
33718
+ {
33719
+ "epoch": 0.04,
33720
+ "learning_rate": 0.0004,
33721
+ "loss": 7.1476,
33722
+ "step": 5532
33723
+ },
33724
+ {
33725
+ "epoch": 0.04,
33726
+ "learning_rate": 0.0004,
33727
+ "loss": 3.1862,
33728
+ "step": 5533
33729
+ },
33730
+ {
33731
+ "epoch": 0.04,
33732
+ "learning_rate": 0.0004,
33733
+ "loss": 7.9094,
33734
+ "step": 5534
33735
+ },
33736
+ {
33737
+ "epoch": 0.04,
33738
+ "learning_rate": 0.0004,
33739
+ "loss": 2.6915,
33740
+ "step": 5535
33741
+ },
33742
+ {
33743
+ "epoch": 0.04,
33744
+ "learning_rate": 0.0004,
33745
+ "loss": 6.4335,
33746
+ "step": 5536
33747
+ },
33748
+ {
33749
+ "epoch": 0.04,
33750
+ "learning_rate": 0.0004,
33751
+ "loss": 3.5026,
33752
+ "step": 5537
33753
+ },
33754
+ {
33755
+ "epoch": 0.04,
33756
+ "learning_rate": 0.0004,
33757
+ "loss": 3.4137,
33758
+ "step": 5538
33759
+ },
33760
+ {
33761
+ "epoch": 0.04,
33762
+ "learning_rate": 0.0004,
33763
+ "loss": 2.4747,
33764
+ "step": 5539
33765
+ },
33766
+ {
33767
+ "epoch": 0.04,
33768
+ "learning_rate": 0.0004,
33769
+ "loss": 2.8525,
33770
+ "step": 5540
33771
+ },
33772
+ {
33773
+ "epoch": 0.04,
33774
+ "learning_rate": 0.0004,
33775
+ "loss": 7.0023,
33776
+ "step": 5541
33777
+ },
33778
+ {
33779
+ "epoch": 0.04,
33780
+ "learning_rate": 0.0004,
33781
+ "loss": 6.7906,
33782
+ "step": 5542
33783
+ },
33784
+ {
33785
+ "epoch": 0.04,
33786
+ "learning_rate": 0.0004,
33787
+ "loss": 6.4716,
33788
+ "step": 5543
33789
+ },
33790
+ {
33791
+ "epoch": 0.04,
33792
+ "learning_rate": 0.0004,
33793
+ "loss": 2.437,
33794
+ "step": 5544
33795
+ },
33796
+ {
33797
+ "epoch": 0.04,
33798
+ "learning_rate": 0.0004,
33799
+ "loss": 2.1311,
33800
+ "step": 5545
33801
+ },
33802
+ {
33803
+ "epoch": 0.04,
33804
+ "learning_rate": 0.0004,
33805
+ "loss": 5.2937,
33806
+ "step": 5546
33807
+ },
33808
+ {
33809
+ "epoch": 0.04,
33810
+ "learning_rate": 0.0004,
33811
+ "loss": 8.6679,
33812
+ "step": 5547
33813
+ },
33814
+ {
33815
+ "epoch": 0.04,
33816
+ "learning_rate": 0.0004,
33817
+ "loss": 6.9048,
33818
+ "step": 5548
33819
+ },
33820
+ {
33821
+ "epoch": 0.04,
33822
+ "learning_rate": 0.0004,
33823
+ "loss": 3.9706,
33824
+ "step": 5549
33825
+ },
33826
+ {
33827
+ "epoch": 0.04,
33828
+ "learning_rate": 0.0004,
33829
+ "loss": 5.5848,
33830
+ "step": 5550
33831
+ },
33832
+ {
33833
+ "epoch": 0.04,
33834
+ "learning_rate": 0.0004,
33835
+ "loss": 9.3629,
33836
+ "step": 5551
33837
+ },
33838
+ {
33839
+ "epoch": 0.04,
33840
+ "learning_rate": 0.0004,
33841
+ "loss": 8.8409,
33842
+ "step": 5552
33843
+ },
33844
+ {
33845
+ "epoch": 0.04,
33846
+ "learning_rate": 0.0004,
33847
+ "loss": 7.8026,
33848
+ "step": 5553
33849
+ },
33850
+ {
33851
+ "epoch": 0.04,
33852
+ "learning_rate": 0.0004,
33853
+ "loss": 6.7644,
33854
+ "step": 5554
33855
+ },
33856
+ {
33857
+ "epoch": 0.04,
33858
+ "learning_rate": 0.0004,
33859
+ "loss": 7.3403,
33860
+ "step": 5555
33861
+ },
33862
+ {
33863
+ "epoch": 0.04,
33864
+ "learning_rate": 0.0004,
33865
+ "loss": 8.8025,
33866
+ "step": 5556
33867
+ },
33868
+ {
33869
+ "epoch": 0.04,
33870
+ "learning_rate": 0.0004,
33871
+ "loss": 7.1454,
33872
+ "step": 5557
33873
+ },
33874
+ {
33875
+ "epoch": 0.04,
33876
+ "learning_rate": 0.0004,
33877
+ "loss": 7.9874,
33878
+ "step": 5558
33879
+ },
33880
+ {
33881
+ "epoch": 0.04,
33882
+ "learning_rate": 0.0004,
33883
+ "loss": 7.5292,
33884
+ "step": 5559
33885
+ },
33886
+ {
33887
+ "epoch": 0.04,
33888
+ "learning_rate": 0.0004,
33889
+ "loss": 6.8845,
33890
+ "step": 5560
33891
+ },
33892
+ {
33893
+ "epoch": 0.04,
33894
+ "learning_rate": 0.0004,
33895
+ "loss": 5.9432,
33896
+ "step": 5561
33897
+ },
33898
+ {
33899
+ "epoch": 0.04,
33900
+ "learning_rate": 0.0004,
33901
+ "loss": 5.5414,
33902
+ "step": 5562
33903
+ },
33904
+ {
33905
+ "epoch": 0.04,
33906
+ "learning_rate": 0.0004,
33907
+ "loss": 7.7734,
33908
+ "step": 5563
33909
+ },
33910
+ {
33911
+ "epoch": 0.04,
33912
+ "learning_rate": 0.0004,
33913
+ "loss": 4.0763,
33914
+ "step": 5564
33915
+ },
33916
+ {
33917
+ "epoch": 0.04,
33918
+ "learning_rate": 0.0004,
33919
+ "loss": 6.5921,
33920
+ "step": 5565
33921
+ },
33922
+ {
33923
+ "epoch": 0.04,
33924
+ "learning_rate": 0.0004,
33925
+ "loss": 6.3899,
33926
+ "step": 5566
33927
+ },
33928
+ {
33929
+ "epoch": 0.04,
33930
+ "learning_rate": 0.0004,
33931
+ "loss": 8.175,
33932
+ "step": 5567
33933
+ },
33934
+ {
33935
+ "epoch": 0.04,
33936
+ "learning_rate": 0.0004,
33937
+ "loss": 6.3888,
33938
+ "step": 5568
33939
+ },
33940
+ {
33941
+ "epoch": 0.04,
33942
+ "learning_rate": 0.0004,
33943
+ "loss": 6.1035,
33944
+ "step": 5569
33945
+ },
33946
+ {
33947
+ "epoch": 0.04,
33948
+ "learning_rate": 0.0004,
33949
+ "loss": 7.9351,
33950
+ "step": 5570
33951
+ },
33952
+ {
33953
+ "epoch": 0.04,
33954
+ "learning_rate": 0.0004,
33955
+ "loss": 3.6726,
33956
+ "step": 5571
33957
+ },
33958
+ {
33959
+ "epoch": 0.04,
33960
+ "learning_rate": 0.0004,
33961
+ "loss": 10.7635,
33962
+ "step": 5572
33963
+ },
33964
+ {
33965
+ "epoch": 0.04,
33966
+ "learning_rate": 0.0004,
33967
+ "loss": 7.6501,
33968
+ "step": 5573
33969
+ },
33970
+ {
33971
+ "epoch": 0.04,
33972
+ "learning_rate": 0.0004,
33973
+ "loss": 8.5995,
33974
+ "step": 5574
33975
+ },
33976
+ {
33977
+ "epoch": 0.04,
33978
+ "learning_rate": 0.0004,
33979
+ "loss": 7.9299,
33980
+ "step": 5575
33981
+ },
33982
+ {
33983
+ "epoch": 0.04,
33984
+ "learning_rate": 0.0004,
33985
+ "loss": 7.6476,
33986
+ "step": 5576
33987
+ },
33988
+ {
33989
+ "epoch": 0.04,
33990
+ "learning_rate": 0.0004,
33991
+ "loss": 3.6735,
33992
+ "step": 5577
33993
+ },
33994
+ {
33995
+ "epoch": 0.04,
33996
+ "learning_rate": 0.0004,
33997
+ "loss": 3.5287,
33998
+ "step": 5578
33999
+ },
34000
+ {
34001
+ "epoch": 0.04,
34002
+ "learning_rate": 0.0004,
34003
+ "loss": 3.2667,
34004
+ "step": 5579
34005
+ },
34006
+ {
34007
+ "epoch": 0.04,
34008
+ "learning_rate": 0.0004,
34009
+ "loss": 4.7265,
34010
+ "step": 5580
34011
+ },
34012
+ {
34013
+ "epoch": 0.04,
34014
+ "learning_rate": 0.0004,
34015
+ "loss": 4.2115,
34016
+ "step": 5581
34017
+ },
34018
+ {
34019
+ "epoch": 0.04,
34020
+ "learning_rate": 0.0004,
34021
+ "loss": 5.5158,
34022
+ "step": 5582
34023
+ },
34024
+ {
34025
+ "epoch": 0.04,
34026
+ "learning_rate": 0.0004,
34027
+ "loss": 6.8307,
34028
+ "step": 5583
34029
+ },
34030
+ {
34031
+ "epoch": 0.04,
34032
+ "learning_rate": 0.0004,
34033
+ "loss": 4.8515,
34034
+ "step": 5584
34035
+ },
34036
+ {
34037
+ "epoch": 0.04,
34038
+ "learning_rate": 0.0004,
34039
+ "loss": 7.112,
34040
+ "step": 5585
34041
+ },
34042
+ {
34043
+ "epoch": 0.04,
34044
+ "learning_rate": 0.0004,
34045
+ "loss": 6.6433,
34046
+ "step": 5586
34047
+ },
34048
+ {
34049
+ "epoch": 0.04,
34050
+ "learning_rate": 0.0004,
34051
+ "loss": 7.7307,
34052
+ "step": 5587
34053
+ },
34054
+ {
34055
+ "epoch": 0.04,
34056
+ "learning_rate": 0.0004,
34057
+ "loss": 5.9845,
34058
+ "step": 5588
34059
+ },
34060
+ {
34061
+ "epoch": 0.04,
34062
+ "learning_rate": 0.0004,
34063
+ "loss": 7.029,
34064
+ "step": 5589
34065
+ },
34066
+ {
34067
+ "epoch": 0.04,
34068
+ "learning_rate": 0.0004,
34069
+ "loss": 4.3792,
34070
+ "step": 5590
34071
+ },
34072
+ {
34073
+ "epoch": 0.04,
34074
+ "learning_rate": 0.0004,
34075
+ "loss": 5.3602,
34076
+ "step": 5591
34077
+ },
34078
+ {
34079
+ "epoch": 0.04,
34080
+ "learning_rate": 0.0004,
34081
+ "loss": 3.9109,
34082
+ "step": 5592
34083
+ },
34084
+ {
34085
+ "epoch": 0.04,
34086
+ "learning_rate": 0.0004,
34087
+ "loss": 5.5172,
34088
+ "step": 5593
34089
+ },
34090
+ {
34091
+ "epoch": 0.04,
34092
+ "learning_rate": 0.0004,
34093
+ "loss": 3.9592,
34094
+ "step": 5594
34095
+ },
34096
+ {
34097
+ "epoch": 0.04,
34098
+ "learning_rate": 0.0004,
34099
+ "loss": 4.125,
34100
+ "step": 5595
34101
+ },
34102
+ {
34103
+ "epoch": 0.04,
34104
+ "learning_rate": 0.0004,
34105
+ "loss": 5.3487,
34106
+ "step": 5596
34107
+ },
34108
+ {
34109
+ "epoch": 0.04,
34110
+ "learning_rate": 0.0004,
34111
+ "loss": 6.7272,
34112
+ "step": 5597
34113
+ },
34114
+ {
34115
+ "epoch": 0.04,
34116
+ "learning_rate": 0.0004,
34117
+ "loss": 5.6483,
34118
+ "step": 5598
34119
+ },
34120
+ {
34121
+ "epoch": 0.04,
34122
+ "learning_rate": 0.0004,
34123
+ "loss": 2.811,
34124
+ "step": 5599
34125
+ },
34126
+ {
34127
+ "epoch": 0.04,
34128
+ "learning_rate": 0.0004,
34129
+ "loss": 7.2748,
34130
+ "step": 5600
34131
+ },
34132
+ {
34133
+ "epoch": 0.04,
34134
+ "eval_loss": 6.424686431884766,
34135
+ "eval_runtime": 22.4149,
34136
+ "eval_samples_per_second": 2.231,
34137
+ "eval_steps_per_second": 1.115,
34138
+ "step": 5600
34139
+ },
34140
+ {
34141
+ "epoch": 0.04,
34142
+ "mmlu_eval_accuracy": 0.2525477994227994,
34143
+ "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
34144
+ "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
34145
+ "mmlu_eval_accuracy_astronomy": 0.3125,
34146
+ "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
34147
+ "mmlu_loss": 3.697023420333862,
34148
+ "step": 5600
34149
+ },
34150
+ {
34151
+ "epoch": 0.04,
34152
+ "learning_rate": 0.0004,
34153
+ "loss": 7.9876,
34154
+ "step": 5601
34155
+ },
34156
+ {
34157
+ "epoch": 0.04,
34158
+ "learning_rate": 0.0004,
34159
+ "loss": 4.0035,
34160
+ "step": 5602
34161
+ },
34162
+ {
34163
+ "epoch": 0.04,
34164
+ "learning_rate": 0.0004,
34165
+ "loss": 9.1667,
34166
+ "step": 5603
34167
+ },
34168
+ {
34169
+ "epoch": 0.04,
34170
+ "learning_rate": 0.0004,
34171
+ "loss": 3.504,
34172
+ "step": 5604
34173
+ },
34174
+ {
34175
+ "epoch": 0.04,
34176
+ "learning_rate": 0.0004,
34177
+ "loss": 8.1787,
34178
+ "step": 5605
34179
+ },
34180
+ {
34181
+ "epoch": 0.04,
34182
+ "learning_rate": 0.0004,
34183
+ "loss": 5.9815,
34184
+ "step": 5606
34185
+ },
34186
+ {
34187
+ "epoch": 0.04,
34188
+ "learning_rate": 0.0004,
34189
+ "loss": 7.3385,
34190
+ "step": 5607
34191
+ },
34192
+ {
34193
+ "epoch": 0.04,
34194
+ "learning_rate": 0.0004,
34195
+ "loss": 8.861,
34196
+ "step": 5608
34197
+ },
34198
+ {
34199
+ "epoch": 0.04,
34200
+ "learning_rate": 0.0004,
34201
+ "loss": 7.4176,
34202
+ "step": 5609
34203
+ },
34204
+ {
34205
+ "epoch": 0.04,
34206
+ "learning_rate": 0.0004,
34207
+ "loss": 8.1611,
34208
+ "step": 5610
34209
+ },
34210
+ {
34211
+ "epoch": 0.04,
34212
+ "learning_rate": 0.0004,
34213
+ "loss": 4.066,
34214
+ "step": 5611
34215
+ },
34216
+ {
34217
+ "epoch": 0.04,
34218
+ "learning_rate": 0.0004,
34219
+ "loss": 3.022,
34220
+ "step": 5612
34221
+ },
34222
+ {
34223
+ "epoch": 0.04,
34224
+ "learning_rate": 0.0004,
34225
+ "loss": 6.3958,
34226
+ "step": 5613
34227
+ },
34228
+ {
34229
+ "epoch": 0.04,
34230
+ "learning_rate": 0.0004,
34231
+ "loss": 7.3849,
34232
+ "step": 5614
34233
+ },
34234
+ {
34235
+ "epoch": 0.04,
34236
+ "learning_rate": 0.0004,
34237
+ "loss": 6.97,
34238
+ "step": 5615
34239
+ },
34240
+ {
34241
+ "epoch": 0.04,
34242
+ "learning_rate": 0.0004,
34243
+ "loss": 6.9633,
34244
+ "step": 5616
34245
+ },
34246
+ {
34247
+ "epoch": 0.04,
34248
+ "learning_rate": 0.0004,
34249
+ "loss": 6.1699,
34250
+ "step": 5617
34251
+ },
34252
+ {
34253
+ "epoch": 0.04,
34254
+ "learning_rate": 0.0004,
34255
+ "loss": 5.2956,
34256
+ "step": 5618
34257
+ },
34258
+ {
34259
+ "epoch": 0.04,
34260
+ "learning_rate": 0.0004,
34261
+ "loss": 6.3328,
34262
+ "step": 5619
34263
+ },
34264
+ {
34265
+ "epoch": 0.04,
34266
+ "learning_rate": 0.0004,
34267
+ "loss": 6.392,
34268
+ "step": 5620
34269
+ },
34270
+ {
34271
+ "epoch": 0.04,
34272
+ "learning_rate": 0.0004,
34273
+ "loss": 3.0549,
34274
+ "step": 5621
34275
+ },
34276
+ {
34277
+ "epoch": 0.04,
34278
+ "learning_rate": 0.0004,
34279
+ "loss": 3.0383,
34280
+ "step": 5622
34281
+ },
34282
+ {
34283
+ "epoch": 0.04,
34284
+ "learning_rate": 0.0004,
34285
+ "loss": 5.7526,
34286
+ "step": 5623
34287
+ },
34288
+ {
34289
+ "epoch": 0.04,
34290
+ "learning_rate": 0.0004,
34291
+ "loss": 8.3642,
34292
+ "step": 5624
34293
+ },
34294
+ {
34295
+ "epoch": 0.04,
34296
+ "learning_rate": 0.0004,
34297
+ "loss": 7.4075,
34298
+ "step": 5625
34299
+ },
34300
+ {
34301
+ "epoch": 0.04,
34302
+ "learning_rate": 0.0004,
34303
+ "loss": 5.4872,
34304
+ "step": 5626
34305
+ },
34306
+ {
34307
+ "epoch": 0.04,
34308
+ "learning_rate": 0.0004,
34309
+ "loss": 5.468,
34310
+ "step": 5627
34311
+ },
34312
+ {
34313
+ "epoch": 0.04,
34314
+ "learning_rate": 0.0004,
34315
+ "loss": 7.0869,
34316
+ "step": 5628
34317
+ },
34318
+ {
34319
+ "epoch": 0.04,
34320
+ "learning_rate": 0.0004,
34321
+ "loss": 5.0191,
34322
+ "step": 5629
34323
+ },
34324
+ {
34325
+ "epoch": 0.04,
34326
+ "learning_rate": 0.0004,
34327
+ "loss": 6.4106,
34328
+ "step": 5630
34329
+ },
34330
+ {
34331
+ "epoch": 0.04,
34332
+ "learning_rate": 0.0004,
34333
+ "loss": 3.9285,
34334
+ "step": 5631
34335
+ },
34336
+ {
34337
+ "epoch": 0.04,
34338
+ "learning_rate": 0.0004,
34339
+ "loss": 6.4914,
34340
+ "step": 5632
34341
+ },
34342
+ {
34343
+ "epoch": 0.04,
34344
+ "learning_rate": 0.0004,
34345
+ "loss": 6.6292,
34346
+ "step": 5633
34347
+ },
34348
+ {
34349
+ "epoch": 0.04,
34350
+ "learning_rate": 0.0004,
34351
+ "loss": 8.1575,
34352
+ "step": 5634
34353
+ },
34354
+ {
34355
+ "epoch": 0.04,
34356
+ "learning_rate": 0.0004,
34357
+ "loss": 5.2383,
34358
+ "step": 5635
34359
+ },
34360
+ {
34361
+ "epoch": 0.04,
34362
+ "learning_rate": 0.0004,
34363
+ "loss": 7.0047,
34364
+ "step": 5636
34365
+ },
34366
+ {
34367
+ "epoch": 0.04,
34368
+ "learning_rate": 0.0004,
34369
+ "loss": 8.2193,
34370
+ "step": 5637
34371
+ },
34372
+ {
34373
+ "epoch": 0.04,
34374
+ "learning_rate": 0.0004,
34375
+ "loss": 2.6003,
34376
+ "step": 5638
34377
+ },
34378
+ {
34379
+ "epoch": 0.04,
34380
+ "learning_rate": 0.0004,
34381
+ "loss": 5.394,
34382
+ "step": 5639
34383
+ },
34384
+ {
34385
+ "epoch": 0.04,
34386
+ "learning_rate": 0.0004,
34387
+ "loss": 2.5271,
34388
+ "step": 5640
34389
+ },
34390
+ {
34391
+ "epoch": 0.04,
34392
+ "learning_rate": 0.0004,
34393
+ "loss": 3.5289,
34394
+ "step": 5641
34395
+ },
34396
+ {
34397
+ "epoch": 0.04,
34398
+ "learning_rate": 0.0004,
34399
+ "loss": 5.5975,
34400
+ "step": 5642
34401
+ },
34402
+ {
34403
+ "epoch": 0.04,
34404
+ "learning_rate": 0.0004,
34405
+ "loss": 4.8294,
34406
+ "step": 5643
34407
+ },
34408
+ {
34409
+ "epoch": 0.04,
34410
+ "learning_rate": 0.0004,
34411
+ "loss": 4.3174,
34412
+ "step": 5644
34413
+ },
34414
+ {
34415
+ "epoch": 0.04,
34416
+ "learning_rate": 0.0004,
34417
+ "loss": 6.1406,
34418
+ "step": 5645
34419
+ },
34420
+ {
34421
+ "epoch": 0.04,
34422
+ "learning_rate": 0.0004,
34423
+ "loss": 3.8035,
34424
+ "step": 5646
34425
+ },
34426
+ {
34427
+ "epoch": 0.04,
34428
+ "learning_rate": 0.0004,
34429
+ "loss": 4.709,
34430
+ "step": 5647
34431
+ },
34432
+ {
34433
+ "epoch": 0.04,
34434
+ "learning_rate": 0.0004,
34435
+ "loss": 6.5739,
34436
+ "step": 5648
34437
+ },
34438
+ {
34439
+ "epoch": 0.04,
34440
+ "learning_rate": 0.0004,
34441
+ "loss": 2.2544,
34442
+ "step": 5649
34443
+ },
34444
+ {
34445
+ "epoch": 0.04,
34446
+ "learning_rate": 0.0004,
34447
+ "loss": 2.8751,
34448
+ "step": 5650
34449
+ },
34450
+ {
34451
+ "epoch": 0.04,
34452
+ "learning_rate": 0.0004,
34453
+ "loss": 8.6977,
34454
+ "step": 5651
34455
+ },
34456
+ {
34457
+ "epoch": 0.04,
34458
+ "learning_rate": 0.0004,
34459
+ "loss": 7.6419,
34460
+ "step": 5652
34461
+ },
34462
+ {
34463
+ "epoch": 0.04,
34464
+ "learning_rate": 0.0004,
34465
+ "loss": 6.7223,
34466
+ "step": 5653
34467
+ },
34468
+ {
34469
+ "epoch": 0.04,
34470
+ "learning_rate": 0.0004,
34471
+ "loss": 7.1536,
34472
+ "step": 5654
34473
+ },
34474
+ {
34475
+ "epoch": 0.04,
34476
+ "learning_rate": 0.0004,
34477
+ "loss": 7.181,
34478
+ "step": 5655
34479
+ },
34480
+ {
34481
+ "epoch": 0.04,
34482
+ "learning_rate": 0.0004,
34483
+ "loss": 6.9262,
34484
+ "step": 5656
34485
+ },
34486
+ {
34487
+ "epoch": 0.04,
34488
+ "learning_rate": 0.0004,
34489
+ "loss": 6.5811,
34490
+ "step": 5657
34491
+ },
34492
+ {
34493
+ "epoch": 0.04,
34494
+ "learning_rate": 0.0004,
34495
+ "loss": 5.4126,
34496
+ "step": 5658
34497
+ },
34498
+ {
34499
+ "epoch": 0.04,
34500
+ "learning_rate": 0.0004,
34501
+ "loss": 3.8075,
34502
+ "step": 5659
34503
+ },
34504
+ {
34505
+ "epoch": 0.04,
34506
+ "learning_rate": 0.0004,
34507
+ "loss": 8.6973,
34508
+ "step": 5660
34509
+ },
34510
+ {
34511
+ "epoch": 0.04,
34512
+ "learning_rate": 0.0004,
34513
+ "loss": 6.8127,
34514
+ "step": 5661
34515
+ },
34516
+ {
34517
+ "epoch": 0.04,
34518
+ "learning_rate": 0.0004,
34519
+ "loss": 6.5205,
34520
+ "step": 5662
34521
+ },
34522
+ {
34523
+ "epoch": 0.04,
34524
+ "learning_rate": 0.0004,
34525
+ "loss": 7.7184,
34526
+ "step": 5663
34527
+ },
34528
+ {
34529
+ "epoch": 0.04,
34530
+ "learning_rate": 0.0004,
34531
+ "loss": 2.6631,
34532
+ "step": 5664
34533
+ },
34534
+ {
34535
+ "epoch": 0.04,
34536
+ "learning_rate": 0.0004,
34537
+ "loss": 8.0133,
34538
+ "step": 5665
34539
+ },
34540
+ {
34541
+ "epoch": 0.04,
34542
+ "learning_rate": 0.0004,
34543
+ "loss": 8.4575,
34544
+ "step": 5666
34545
+ },
34546
+ {
34547
+ "epoch": 0.04,
34548
+ "learning_rate": 0.0004,
34549
+ "loss": 7.2522,
34550
+ "step": 5667
34551
+ },
34552
+ {
34553
+ "epoch": 0.04,
34554
+ "learning_rate": 0.0004,
34555
+ "loss": 7.638,
34556
+ "step": 5668
34557
+ },
34558
+ {
34559
+ "epoch": 0.04,
34560
+ "learning_rate": 0.0004,
34561
+ "loss": 5.5406,
34562
+ "step": 5669
34563
+ },
34564
+ {
34565
+ "epoch": 0.04,
34566
+ "learning_rate": 0.0004,
34567
+ "loss": 6.9613,
34568
+ "step": 5670
34569
+ },
34570
+ {
34571
+ "epoch": 0.04,
34572
+ "learning_rate": 0.0004,
34573
+ "loss": 7.8844,
34574
+ "step": 5671
34575
+ },
34576
+ {
34577
+ "epoch": 0.04,
34578
+ "learning_rate": 0.0004,
34579
+ "loss": 7.2222,
34580
+ "step": 5672
34581
+ },
34582
+ {
34583
+ "epoch": 0.04,
34584
+ "learning_rate": 0.0004,
34585
+ "loss": 5.2324,
34586
+ "step": 5673
34587
+ },
34588
+ {
34589
+ "epoch": 0.04,
34590
+ "learning_rate": 0.0004,
34591
+ "loss": 2.9769,
34592
+ "step": 5674
34593
+ },
34594
+ {
34595
+ "epoch": 0.04,
34596
+ "learning_rate": 0.0004,
34597
+ "loss": 5.4176,
34598
+ "step": 5675
34599
+ },
34600
+ {
34601
+ "epoch": 0.04,
34602
+ "learning_rate": 0.0004,
34603
+ "loss": 3.5889,
34604
+ "step": 5676
34605
+ },
34606
+ {
34607
+ "epoch": 0.04,
34608
+ "learning_rate": 0.0004,
34609
+ "loss": 6.2919,
34610
+ "step": 5677
34611
+ },
34612
+ {
34613
+ "epoch": 0.04,
34614
+ "learning_rate": 0.0004,
34615
+ "loss": 6.9505,
34616
+ "step": 5678
34617
+ },
34618
+ {
34619
+ "epoch": 0.04,
34620
+ "learning_rate": 0.0004,
34621
+ "loss": 5.694,
34622
+ "step": 5679
34623
+ },
34624
+ {
34625
+ "epoch": 0.04,
34626
+ "learning_rate": 0.0004,
34627
+ "loss": 6.5429,
34628
+ "step": 5680
34629
+ },
34630
+ {
34631
+ "epoch": 0.04,
34632
+ "learning_rate": 0.0004,
34633
+ "loss": 5.5205,
34634
+ "step": 5681
34635
+ },
34636
+ {
34637
+ "epoch": 0.04,
34638
+ "learning_rate": 0.0004,
34639
+ "loss": 6.5445,
34640
+ "step": 5682
34641
+ },
34642
+ {
34643
+ "epoch": 0.04,
34644
+ "learning_rate": 0.0004,
34645
+ "loss": 7.3609,
34646
+ "step": 5683
34647
+ },
34648
+ {
34649
+ "epoch": 0.04,
34650
+ "learning_rate": 0.0004,
34651
+ "loss": 7.3904,
34652
+ "step": 5684
34653
+ },
34654
+ {
34655
+ "epoch": 0.04,
34656
+ "learning_rate": 0.0004,
34657
+ "loss": 5.3472,
34658
+ "step": 5685
34659
+ },
34660
+ {
34661
+ "epoch": 0.04,
34662
+ "learning_rate": 0.0004,
34663
+ "loss": 4.7564,
34664
+ "step": 5686
34665
+ },
34666
+ {
34667
+ "epoch": 0.04,
34668
+ "learning_rate": 0.0004,
34669
+ "loss": 6.312,
34670
+ "step": 5687
34671
+ },
34672
+ {
34673
+ "epoch": 0.04,
34674
+ "learning_rate": 0.0004,
34675
+ "loss": 5.4367,
34676
+ "step": 5688
34677
+ },
34678
+ {
34679
+ "epoch": 0.04,
34680
+ "learning_rate": 0.0004,
34681
+ "loss": 8.4472,
34682
+ "step": 5689
34683
+ },
34684
+ {
34685
+ "epoch": 0.04,
34686
+ "learning_rate": 0.0004,
34687
+ "loss": 5.8272,
34688
+ "step": 5690
34689
+ },
34690
+ {
34691
+ "epoch": 0.04,
34692
+ "learning_rate": 0.0004,
34693
+ "loss": 5.2634,
34694
+ "step": 5691
34695
+ },
34696
+ {
34697
+ "epoch": 0.04,
34698
+ "learning_rate": 0.0004,
34699
+ "loss": 3.2939,
34700
+ "step": 5692
34701
+ },
34702
+ {
34703
+ "epoch": 0.04,
34704
+ "learning_rate": 0.0004,
34705
+ "loss": 2.657,
34706
+ "step": 5693
34707
+ },
34708
+ {
34709
+ "epoch": 0.04,
34710
+ "learning_rate": 0.0004,
34711
+ "loss": 3.1746,
34712
+ "step": 5694
34713
+ },
34714
+ {
34715
+ "epoch": 0.04,
34716
+ "learning_rate": 0.0004,
34717
+ "loss": 3.8332,
34718
+ "step": 5695
34719
+ },
34720
+ {
34721
+ "epoch": 0.04,
34722
+ "learning_rate": 0.0004,
34723
+ "loss": 4.94,
34724
+ "step": 5696
34725
+ },
34726
+ {
34727
+ "epoch": 0.04,
34728
+ "learning_rate": 0.0004,
34729
+ "loss": 6.7484,
34730
+ "step": 5697
34731
+ },
34732
+ {
34733
+ "epoch": 0.04,
34734
+ "learning_rate": 0.0004,
34735
+ "loss": 5.5731,
34736
+ "step": 5698
34737
+ },
34738
+ {
34739
+ "epoch": 0.04,
34740
+ "learning_rate": 0.0004,
34741
+ "loss": 2.7432,
34742
+ "step": 5699
34743
+ },
34744
+ {
34745
+ "epoch": 0.04,
34746
+ "learning_rate": 0.0004,
34747
+ "loss": 2.3457,
34748
+ "step": 5700
34749
+ },
34750
+ {
34751
+ "epoch": 0.04,
34752
+ "learning_rate": 0.0004,
34753
+ "loss": 7.3142,
34754
+ "step": 5701
34755
+ },
34756
+ {
34757
+ "epoch": 0.04,
34758
+ "learning_rate": 0.0004,
34759
+ "loss": 8.6531,
34760
+ "step": 5702
34761
+ },
34762
+ {
34763
+ "epoch": 0.04,
34764
+ "learning_rate": 0.0004,
34765
+ "loss": 8.9737,
34766
+ "step": 5703
34767
+ },
34768
+ {
34769
+ "epoch": 0.04,
34770
+ "learning_rate": 0.0004,
34771
+ "loss": 5.6196,
34772
+ "step": 5704
34773
+ },
34774
+ {
34775
+ "epoch": 0.04,
34776
+ "learning_rate": 0.0004,
34777
+ "loss": 4.8655,
34778
+ "step": 5705
34779
+ },
34780
+ {
34781
+ "epoch": 0.04,
34782
+ "learning_rate": 0.0004,
34783
+ "loss": 3.082,
34784
+ "step": 5706
34785
+ },
34786
+ {
34787
+ "epoch": 0.04,
34788
+ "learning_rate": 0.0004,
34789
+ "loss": 3.3827,
34790
+ "step": 5707
34791
+ },
34792
+ {
34793
+ "epoch": 0.04,
34794
+ "learning_rate": 0.0004,
34795
+ "loss": 6.4305,
34796
+ "step": 5708
34797
+ },
34798
+ {
34799
+ "epoch": 0.04,
34800
+ "learning_rate": 0.0004,
34801
+ "loss": 7.6621,
34802
+ "step": 5709
34803
+ },
34804
+ {
34805
+ "epoch": 0.04,
34806
+ "learning_rate": 0.0004,
34807
+ "loss": 7.9571,
34808
+ "step": 5710
34809
+ },
34810
+ {
34811
+ "epoch": 0.04,
34812
+ "learning_rate": 0.0004,
34813
+ "loss": 7.9943,
34814
+ "step": 5711
34815
+ },
34816
+ {
34817
+ "epoch": 0.04,
34818
+ "learning_rate": 0.0004,
34819
+ "loss": 8.6949,
34820
+ "step": 5712
34821
+ },
34822
+ {
34823
+ "epoch": 0.04,
34824
+ "learning_rate": 0.0004,
34825
+ "loss": 7.3717,
34826
+ "step": 5713
34827
+ },
34828
+ {
34829
+ "epoch": 0.04,
34830
+ "learning_rate": 0.0004,
34831
+ "loss": 7.3738,
34832
+ "step": 5714
34833
+ },
34834
+ {
34835
+ "epoch": 0.04,
34836
+ "learning_rate": 0.0004,
34837
+ "loss": 6.5416,
34838
+ "step": 5715
34839
+ },
34840
+ {
34841
+ "epoch": 0.04,
34842
+ "learning_rate": 0.0004,
34843
+ "loss": 3.6103,
34844
+ "step": 5716
34845
+ },
34846
+ {
34847
+ "epoch": 0.04,
34848
+ "learning_rate": 0.0004,
34849
+ "loss": 6.9328,
34850
+ "step": 5717
34851
+ },
34852
+ {
34853
+ "epoch": 0.04,
34854
+ "learning_rate": 0.0004,
34855
+ "loss": 7.5956,
34856
+ "step": 5718
34857
+ },
34858
+ {
34859
+ "epoch": 0.04,
34860
+ "learning_rate": 0.0004,
34861
+ "loss": 6.846,
34862
+ "step": 5719
34863
+ },
34864
+ {
34865
+ "epoch": 0.04,
34866
+ "learning_rate": 0.0004,
34867
+ "loss": 8.6016,
34868
+ "step": 5720
34869
+ },
34870
+ {
34871
+ "epoch": 0.04,
34872
+ "learning_rate": 0.0004,
34873
+ "loss": 7.1171,
34874
+ "step": 5721
34875
+ },
34876
+ {
34877
+ "epoch": 0.04,
34878
+ "learning_rate": 0.0004,
34879
+ "loss": 5.5251,
34880
+ "step": 5722
34881
+ },
34882
+ {
34883
+ "epoch": 0.04,
34884
+ "learning_rate": 0.0004,
34885
+ "loss": 6.3209,
34886
+ "step": 5723
34887
+ },
34888
+ {
34889
+ "epoch": 0.04,
34890
+ "learning_rate": 0.0004,
34891
+ "loss": 3.9372,
34892
+ "step": 5724
34893
+ },
34894
+ {
34895
+ "epoch": 0.04,
34896
+ "learning_rate": 0.0004,
34897
+ "loss": 5.4344,
34898
+ "step": 5725
34899
+ },
34900
+ {
34901
+ "epoch": 0.04,
34902
+ "learning_rate": 0.0004,
34903
+ "loss": 3.4504,
34904
+ "step": 5726
34905
+ },
34906
+ {
34907
+ "epoch": 0.04,
34908
+ "learning_rate": 0.0004,
34909
+ "loss": 3.0255,
34910
+ "step": 5727
34911
+ },
34912
+ {
34913
+ "epoch": 0.04,
34914
+ "learning_rate": 0.0004,
34915
+ "loss": 5.6402,
34916
+ "step": 5728
34917
+ },
34918
+ {
34919
+ "epoch": 0.04,
34920
+ "learning_rate": 0.0004,
34921
+ "loss": 5.542,
34922
+ "step": 5729
34923
+ },
34924
+ {
34925
+ "epoch": 0.04,
34926
+ "learning_rate": 0.0004,
34927
+ "loss": 5.1837,
34928
+ "step": 5730
34929
+ },
34930
+ {
34931
+ "epoch": 0.04,
34932
+ "learning_rate": 0.0004,
34933
+ "loss": 3.796,
34934
+ "step": 5731
34935
+ },
34936
+ {
34937
+ "epoch": 0.04,
34938
+ "learning_rate": 0.0004,
34939
+ "loss": 5.8129,
34940
+ "step": 5732
34941
+ },
34942
+ {
34943
+ "epoch": 0.04,
34944
+ "learning_rate": 0.0004,
34945
+ "loss": 7.0823,
34946
+ "step": 5733
34947
+ },
34948
+ {
34949
+ "epoch": 0.04,
34950
+ "learning_rate": 0.0004,
34951
+ "loss": 6.2968,
34952
+ "step": 5734
34953
+ },
34954
+ {
34955
+ "epoch": 0.04,
34956
+ "learning_rate": 0.0004,
34957
+ "loss": 6.0133,
34958
+ "step": 5735
34959
+ },
34960
+ {
34961
+ "epoch": 0.04,
34962
+ "learning_rate": 0.0004,
34963
+ "loss": 6.5933,
34964
+ "step": 5736
34965
+ },
34966
+ {
34967
+ "epoch": 0.04,
34968
+ "learning_rate": 0.0004,
34969
+ "loss": 6.9654,
34970
+ "step": 5737
34971
+ },
34972
+ {
34973
+ "epoch": 0.04,
34974
+ "learning_rate": 0.0004,
34975
+ "loss": 3.2416,
34976
+ "step": 5738
34977
+ },
34978
+ {
34979
+ "epoch": 0.04,
34980
+ "learning_rate": 0.0004,
34981
+ "loss": 6.8676,
34982
+ "step": 5739
34983
+ },
34984
+ {
34985
+ "epoch": 0.04,
34986
+ "learning_rate": 0.0004,
34987
+ "loss": 8.6902,
34988
+ "step": 5740
34989
+ },
34990
+ {
34991
+ "epoch": 0.04,
34992
+ "learning_rate": 0.0004,
34993
+ "loss": 4.9099,
34994
+ "step": 5741
34995
+ },
34996
+ {
34997
+ "epoch": 0.04,
34998
+ "learning_rate": 0.0004,
34999
+ "loss": 5.0585,
35000
+ "step": 5742
35001
+ },
35002
+ {
35003
+ "epoch": 0.04,
35004
+ "learning_rate": 0.0004,
35005
+ "loss": 7.2383,
35006
+ "step": 5743
35007
+ },
35008
+ {
35009
+ "epoch": 0.04,
35010
+ "learning_rate": 0.0004,
35011
+ "loss": 5.7071,
35012
+ "step": 5744
35013
+ },
35014
+ {
35015
+ "epoch": 0.04,
35016
+ "learning_rate": 0.0004,
35017
+ "loss": 4.0832,
35018
+ "step": 5745
35019
+ },
35020
+ {
35021
+ "epoch": 0.04,
35022
+ "learning_rate": 0.0004,
35023
+ "loss": 6.3828,
35024
+ "step": 5746
35025
+ },
35026
+ {
35027
+ "epoch": 0.04,
35028
+ "learning_rate": 0.0004,
35029
+ "loss": 6.8322,
35030
+ "step": 5747
35031
+ },
35032
+ {
35033
+ "epoch": 0.04,
35034
+ "learning_rate": 0.0004,
35035
+ "loss": 2.7194,
35036
+ "step": 5748
35037
+ },
35038
+ {
35039
+ "epoch": 0.04,
35040
+ "learning_rate": 0.0004,
35041
+ "loss": 3.3352,
35042
+ "step": 5749
35043
+ },
35044
+ {
35045
+ "epoch": 0.04,
35046
+ "learning_rate": 0.0004,
35047
+ "loss": 3.3438,
35048
+ "step": 5750
35049
+ },
35050
+ {
35051
+ "epoch": 0.04,
35052
+ "learning_rate": 0.0004,
35053
+ "loss": 8.5028,
35054
+ "step": 5751
35055
+ },
35056
+ {
35057
+ "epoch": 0.04,
35058
+ "learning_rate": 0.0004,
35059
+ "loss": 9.2395,
35060
+ "step": 5752
35061
+ },
35062
+ {
35063
+ "epoch": 0.04,
35064
+ "learning_rate": 0.0004,
35065
+ "loss": 8.8966,
35066
+ "step": 5753
35067
+ },
35068
+ {
35069
+ "epoch": 0.04,
35070
+ "learning_rate": 0.0004,
35071
+ "loss": 5.8439,
35072
+ "step": 5754
35073
+ },
35074
+ {
35075
+ "epoch": 0.04,
35076
+ "learning_rate": 0.0004,
35077
+ "loss": 8.026,
35078
+ "step": 5755
35079
+ },
35080
+ {
35081
+ "epoch": 0.04,
35082
+ "learning_rate": 0.0004,
35083
+ "loss": 3.7253,
35084
+ "step": 5756
35085
+ },
35086
+ {
35087
+ "epoch": 0.04,
35088
+ "learning_rate": 0.0004,
35089
+ "loss": 2.6592,
35090
+ "step": 5757
35091
+ },
35092
+ {
35093
+ "epoch": 0.04,
35094
+ "learning_rate": 0.0004,
35095
+ "loss": 4.6777,
35096
+ "step": 5758
35097
+ },
35098
+ {
35099
+ "epoch": 0.04,
35100
+ "learning_rate": 0.0004,
35101
+ "loss": 6.7246,
35102
+ "step": 5759
35103
+ },
35104
+ {
35105
+ "epoch": 0.04,
35106
+ "learning_rate": 0.0004,
35107
+ "loss": 6.5969,
35108
+ "step": 5760
35109
+ },
35110
+ {
35111
+ "epoch": 0.04,
35112
+ "learning_rate": 0.0004,
35113
+ "loss": 7.5921,
35114
+ "step": 5761
35115
+ },
35116
+ {
35117
+ "epoch": 0.04,
35118
+ "learning_rate": 0.0004,
35119
+ "loss": 6.2614,
35120
+ "step": 5762
35121
+ },
35122
+ {
35123
+ "epoch": 0.04,
35124
+ "learning_rate": 0.0004,
35125
+ "loss": 8.1911,
35126
+ "step": 5763
35127
+ },
35128
+ {
35129
+ "epoch": 0.04,
35130
+ "learning_rate": 0.0004,
35131
+ "loss": 3.013,
35132
+ "step": 5764
35133
+ },
35134
+ {
35135
+ "epoch": 0.04,
35136
+ "learning_rate": 0.0004,
35137
+ "loss": 5.5307,
35138
+ "step": 5765
35139
+ },
35140
+ {
35141
+ "epoch": 0.04,
35142
+ "learning_rate": 0.0004,
35143
+ "loss": 3.5039,
35144
+ "step": 5766
35145
+ },
35146
+ {
35147
+ "epoch": 0.04,
35148
+ "learning_rate": 0.0004,
35149
+ "loss": 7.7382,
35150
+ "step": 5767
35151
+ },
35152
+ {
35153
+ "epoch": 0.04,
35154
+ "learning_rate": 0.0004,
35155
+ "loss": 6.9728,
35156
+ "step": 5768
35157
+ },
35158
+ {
35159
+ "epoch": 0.04,
35160
+ "learning_rate": 0.0004,
35161
+ "loss": 3.8039,
35162
+ "step": 5769
35163
+ },
35164
+ {
35165
+ "epoch": 0.04,
35166
+ "learning_rate": 0.0004,
35167
+ "loss": 2.5774,
35168
+ "step": 5770
35169
+ },
35170
+ {
35171
+ "epoch": 0.04,
35172
+ "learning_rate": 0.0004,
35173
+ "loss": 6.3394,
35174
+ "step": 5771
35175
+ },
35176
+ {
35177
+ "epoch": 0.04,
35178
+ "learning_rate": 0.0004,
35179
+ "loss": 6.6831,
35180
+ "step": 5772
35181
+ },
35182
+ {
35183
+ "epoch": 0.04,
35184
+ "learning_rate": 0.0004,
35185
+ "loss": 3.1592,
35186
+ "step": 5773
35187
+ },
35188
+ {
35189
+ "epoch": 0.04,
35190
+ "learning_rate": 0.0004,
35191
+ "loss": 3.0903,
35192
+ "step": 5774
35193
+ },
35194
+ {
35195
+ "epoch": 0.04,
35196
+ "learning_rate": 0.0004,
35197
+ "loss": 5.0717,
35198
+ "step": 5775
35199
+ },
35200
+ {
35201
+ "epoch": 0.04,
35202
+ "learning_rate": 0.0004,
35203
+ "loss": 5.9321,
35204
+ "step": 5776
35205
+ },
35206
+ {
35207
+ "epoch": 0.04,
35208
+ "learning_rate": 0.0004,
35209
+ "loss": 6.8956,
35210
+ "step": 5777
35211
+ },
35212
+ {
35213
+ "epoch": 0.04,
35214
+ "learning_rate": 0.0004,
35215
+ "loss": 6.0156,
35216
+ "step": 5778
35217
+ },
35218
+ {
35219
+ "epoch": 0.04,
35220
+ "learning_rate": 0.0004,
35221
+ "loss": 10.4466,
35222
+ "step": 5779
35223
+ },
35224
+ {
35225
+ "epoch": 0.04,
35226
+ "learning_rate": 0.0004,
35227
+ "loss": 6.7845,
35228
+ "step": 5780
35229
+ },
35230
+ {
35231
+ "epoch": 0.04,
35232
+ "learning_rate": 0.0004,
35233
+ "loss": 6.6201,
35234
+ "step": 5781
35235
+ },
35236
+ {
35237
+ "epoch": 0.04,
35238
+ "learning_rate": 0.0004,
35239
+ "loss": 8.0356,
35240
+ "step": 5782
35241
+ },
35242
+ {
35243
+ "epoch": 0.04,
35244
+ "learning_rate": 0.0004,
35245
+ "loss": 3.6344,
35246
+ "step": 5783
35247
+ },
35248
+ {
35249
+ "epoch": 0.04,
35250
+ "learning_rate": 0.0004,
35251
+ "loss": 1.9238,
35252
+ "step": 5784
35253
+ },
35254
+ {
35255
+ "epoch": 0.04,
35256
+ "learning_rate": 0.0004,
35257
+ "loss": 3.1729,
35258
+ "step": 5785
35259
+ },
35260
+ {
35261
+ "epoch": 0.04,
35262
+ "learning_rate": 0.0004,
35263
+ "loss": 3.2512,
35264
+ "step": 5786
35265
+ },
35266
+ {
35267
+ "epoch": 0.04,
35268
+ "learning_rate": 0.0004,
35269
+ "loss": 5.6445,
35270
+ "step": 5787
35271
+ },
35272
+ {
35273
+ "epoch": 0.04,
35274
+ "learning_rate": 0.0004,
35275
+ "loss": 8.5213,
35276
+ "step": 5788
35277
+ },
35278
+ {
35279
+ "epoch": 0.04,
35280
+ "learning_rate": 0.0004,
35281
+ "loss": 7.6667,
35282
+ "step": 5789
35283
+ },
35284
+ {
35285
+ "epoch": 0.04,
35286
+ "learning_rate": 0.0004,
35287
+ "loss": 6.6139,
35288
+ "step": 5790
35289
+ },
35290
+ {
35291
+ "epoch": 0.04,
35292
+ "learning_rate": 0.0004,
35293
+ "loss": 7.3236,
35294
+ "step": 5791
35295
+ },
35296
+ {
35297
+ "epoch": 0.04,
35298
+ "learning_rate": 0.0004,
35299
+ "loss": 5.4503,
35300
+ "step": 5792
35301
+ },
35302
+ {
35303
+ "epoch": 0.04,
35304
+ "learning_rate": 0.0004,
35305
+ "loss": 5.5111,
35306
+ "step": 5793
35307
+ },
35308
+ {
35309
+ "epoch": 0.04,
35310
+ "learning_rate": 0.0004,
35311
+ "loss": 5.6659,
35312
+ "step": 5794
35313
+ },
35314
+ {
35315
+ "epoch": 0.04,
35316
+ "learning_rate": 0.0004,
35317
+ "loss": 6.4502,
35318
+ "step": 5795
35319
+ },
35320
+ {
35321
+ "epoch": 0.04,
35322
+ "learning_rate": 0.0004,
35323
+ "loss": 7.0923,
35324
+ "step": 5796
35325
+ },
35326
+ {
35327
+ "epoch": 0.04,
35328
+ "learning_rate": 0.0004,
35329
+ "loss": 7.4155,
35330
+ "step": 5797
35331
+ },
35332
+ {
35333
+ "epoch": 0.04,
35334
+ "learning_rate": 0.0004,
35335
+ "loss": 6.3765,
35336
+ "step": 5798
35337
+ },
35338
+ {
35339
+ "epoch": 0.04,
35340
+ "learning_rate": 0.0004,
35341
+ "loss": 6.0413,
35342
+ "step": 5799
35343
+ },
35344
+ {
35345
+ "epoch": 0.04,
35346
+ "learning_rate": 0.0004,
35347
+ "loss": 4.103,
35348
+ "step": 5800
35349
+ },
35350
+ {
35351
+ "epoch": 0.04,
35352
+ "eval_loss": 6.396474838256836,
35353
+ "eval_runtime": 22.3993,
35354
+ "eval_samples_per_second": 2.232,
35355
+ "eval_steps_per_second": 1.116,
35356
+ "step": 5800
35357
+ },
35358
+ {
35359
+ "epoch": 0.04,
35360
+ "mmlu_eval_accuracy": 0.2525477994227994,
35361
+ "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
35362
+ "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
35363
+ "mmlu_eval_accuracy_astronomy": 0.3125,
35364
+ "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
35365
+ "mmlu_loss": 3.9258560848236086,
35366
+ "step": 5800
35367
+ },
35368
+ {
35369
+ "epoch": 0.04,
35370
+ "learning_rate": 0.0004,
35371
+ "loss": 6.7963,
35372
+ "step": 5801
35373
+ },
35374
+ {
35375
+ "epoch": 0.04,
35376
+ "learning_rate": 0.0004,
35377
+ "loss": 7.8511,
35378
+ "step": 5802
35379
+ },
35380
+ {
35381
+ "epoch": 0.04,
35382
+ "learning_rate": 0.0004,
35383
+ "loss": 8.0362,
35384
+ "step": 5803
35385
+ },
35386
+ {
35387
+ "epoch": 0.04,
35388
+ "learning_rate": 0.0004,
35389
+ "loss": 7.8104,
35390
+ "step": 5804
35391
+ },
35392
+ {
35393
+ "epoch": 0.04,
35394
+ "learning_rate": 0.0004,
35395
+ "loss": 3.5734,
35396
+ "step": 5805
35397
+ },
35398
+ {
35399
+ "epoch": 0.04,
35400
+ "learning_rate": 0.0004,
35401
+ "loss": 7.0506,
35402
+ "step": 5806
35403
+ },
35404
+ {
35405
+ "epoch": 0.04,
35406
+ "learning_rate": 0.0004,
35407
+ "loss": 7.656,
35408
+ "step": 5807
35409
+ },
35410
+ {
35411
+ "epoch": 0.04,
35412
+ "learning_rate": 0.0004,
35413
+ "loss": 6.3932,
35414
+ "step": 5808
35415
+ },
35416
+ {
35417
+ "epoch": 0.04,
35418
+ "learning_rate": 0.0004,
35419
+ "loss": 3.8245,
35420
+ "step": 5809
35421
+ },
35422
+ {
35423
+ "epoch": 0.04,
35424
+ "learning_rate": 0.0004,
35425
+ "loss": 7.633,
35426
+ "step": 5810
35427
+ },
35428
+ {
35429
+ "epoch": 0.04,
35430
+ "learning_rate": 0.0004,
35431
+ "loss": 8.2002,
35432
+ "step": 5811
35433
+ },
35434
+ {
35435
+ "epoch": 0.04,
35436
+ "learning_rate": 0.0004,
35437
+ "loss": 7.415,
35438
+ "step": 5812
35439
+ },
35440
+ {
35441
+ "epoch": 0.04,
35442
+ "learning_rate": 0.0004,
35443
+ "loss": 8.3959,
35444
+ "step": 5813
35445
+ },
35446
+ {
35447
+ "epoch": 0.04,
35448
+ "learning_rate": 0.0004,
35449
+ "loss": 5.0557,
35450
+ "step": 5814
35451
+ },
35452
+ {
35453
+ "epoch": 0.04,
35454
+ "learning_rate": 0.0004,
35455
+ "loss": 5.5936,
35456
+ "step": 5815
35457
+ },
35458
+ {
35459
+ "epoch": 0.04,
35460
+ "learning_rate": 0.0004,
35461
+ "loss": 5.6552,
35462
+ "step": 5816
35463
+ },
35464
+ {
35465
+ "epoch": 0.04,
35466
+ "learning_rate": 0.0004,
35467
+ "loss": 6.4557,
35468
+ "step": 5817
35469
+ },
35470
+ {
35471
+ "epoch": 0.04,
35472
+ "learning_rate": 0.0004,
35473
+ "loss": 3.4525,
35474
+ "step": 5818
35475
+ },
35476
+ {
35477
+ "epoch": 0.04,
35478
+ "learning_rate": 0.0004,
35479
+ "loss": 5.2712,
35480
+ "step": 5819
35481
+ },
35482
+ {
35483
+ "epoch": 0.04,
35484
+ "learning_rate": 0.0004,
35485
+ "loss": 6.5788,
35486
+ "step": 5820
35487
+ },
35488
+ {
35489
+ "epoch": 0.04,
35490
+ "learning_rate": 0.0004,
35491
+ "loss": 3.0075,
35492
+ "step": 5821
35493
+ },
35494
+ {
35495
+ "epoch": 0.04,
35496
+ "learning_rate": 0.0004,
35497
+ "loss": 3.6125,
35498
+ "step": 5822
35499
+ },
35500
+ {
35501
+ "epoch": 0.04,
35502
+ "learning_rate": 0.0004,
35503
+ "loss": 6.3804,
35504
+ "step": 5823
35505
+ },
35506
+ {
35507
+ "epoch": 0.04,
35508
+ "learning_rate": 0.0004,
35509
+ "loss": 6.4267,
35510
+ "step": 5824
35511
+ },
35512
+ {
35513
+ "epoch": 0.04,
35514
+ "learning_rate": 0.0004,
35515
+ "loss": 2.6356,
35516
+ "step": 5825
35517
+ },
35518
+ {
35519
+ "epoch": 0.04,
35520
+ "learning_rate": 0.0004,
35521
+ "loss": 3.2399,
35522
+ "step": 5826
35523
+ },
35524
+ {
35525
+ "epoch": 0.04,
35526
+ "learning_rate": 0.0004,
35527
+ "loss": 3.8583,
35528
+ "step": 5827
35529
+ },
35530
+ {
35531
+ "epoch": 0.04,
35532
+ "learning_rate": 0.0004,
35533
+ "loss": 7.3494,
35534
+ "step": 5828
35535
+ },
35536
+ {
35537
+ "epoch": 0.04,
35538
+ "learning_rate": 0.0004,
35539
+ "loss": 8.0112,
35540
+ "step": 5829
35541
+ },
35542
+ {
35543
+ "epoch": 0.04,
35544
+ "learning_rate": 0.0004,
35545
+ "loss": 8.23,
35546
+ "step": 5830
35547
+ },
35548
+ {
35549
+ "epoch": 0.04,
35550
+ "learning_rate": 0.0004,
35551
+ "loss": 3.9515,
35552
+ "step": 5831
35553
+ },
35554
+ {
35555
+ "epoch": 0.04,
35556
+ "learning_rate": 0.0004,
35557
+ "loss": 2.8647,
35558
+ "step": 5832
35559
+ },
35560
+ {
35561
+ "epoch": 0.04,
35562
+ "learning_rate": 0.0004,
35563
+ "loss": 5.9658,
35564
+ "step": 5833
35565
+ },
35566
+ {
35567
+ "epoch": 0.04,
35568
+ "learning_rate": 0.0004,
35569
+ "loss": 5.105,
35570
+ "step": 5834
35571
+ },
35572
+ {
35573
+ "epoch": 0.04,
35574
+ "learning_rate": 0.0004,
35575
+ "loss": 7.1376,
35576
+ "step": 5835
35577
+ },
35578
+ {
35579
+ "epoch": 0.04,
35580
+ "learning_rate": 0.0004,
35581
+ "loss": 3.6733,
35582
+ "step": 5836
35583
+ },
35584
+ {
35585
+ "epoch": 0.04,
35586
+ "learning_rate": 0.0004,
35587
+ "loss": 6.3152,
35588
+ "step": 5837
35589
+ },
35590
+ {
35591
+ "epoch": 0.04,
35592
+ "learning_rate": 0.0004,
35593
+ "loss": 5.0539,
35594
+ "step": 5838
35595
+ },
35596
+ {
35597
+ "epoch": 0.04,
35598
+ "learning_rate": 0.0004,
35599
+ "loss": 4.3399,
35600
+ "step": 5839
35601
+ },
35602
+ {
35603
+ "epoch": 0.04,
35604
+ "learning_rate": 0.0004,
35605
+ "loss": 6.6373,
35606
+ "step": 5840
35607
+ },
35608
+ {
35609
+ "epoch": 0.04,
35610
+ "learning_rate": 0.0004,
35611
+ "loss": 2.2022,
35612
+ "step": 5841
35613
+ },
35614
+ {
35615
+ "epoch": 0.04,
35616
+ "learning_rate": 0.0004,
35617
+ "loss": 4.2046,
35618
+ "step": 5842
35619
+ },
35620
+ {
35621
+ "epoch": 0.04,
35622
+ "learning_rate": 0.0004,
35623
+ "loss": 7.3559,
35624
+ "step": 5843
35625
+ },
35626
+ {
35627
+ "epoch": 0.04,
35628
+ "learning_rate": 0.0004,
35629
+ "loss": 5.4401,
35630
+ "step": 5844
35631
+ },
35632
+ {
35633
+ "epoch": 0.04,
35634
+ "learning_rate": 0.0004,
35635
+ "loss": 2.1425,
35636
+ "step": 5845
35637
+ },
35638
+ {
35639
+ "epoch": 0.04,
35640
+ "learning_rate": 0.0004,
35641
+ "loss": 8.926,
35642
+ "step": 5846
35643
+ },
35644
+ {
35645
+ "epoch": 0.04,
35646
+ "learning_rate": 0.0004,
35647
+ "loss": 6.7124,
35648
+ "step": 5847
35649
+ },
35650
+ {
35651
+ "epoch": 0.04,
35652
+ "learning_rate": 0.0004,
35653
+ "loss": 4.5364,
35654
+ "step": 5848
35655
+ },
35656
+ {
35657
+ "epoch": 0.04,
35658
+ "learning_rate": 0.0004,
35659
+ "loss": 4.0998,
35660
+ "step": 5849
35661
+ },
35662
+ {
35663
+ "epoch": 0.04,
35664
+ "learning_rate": 0.0004,
35665
+ "loss": 4.5025,
35666
+ "step": 5850
35667
+ },
35668
+ {
35669
+ "epoch": 0.04,
35670
+ "learning_rate": 0.0004,
35671
+ "loss": 3.0973,
35672
+ "step": 5851
35673
+ },
35674
+ {
35675
+ "epoch": 0.04,
35676
+ "learning_rate": 0.0004,
35677
+ "loss": 8.7097,
35678
+ "step": 5852
35679
+ },
35680
+ {
35681
+ "epoch": 0.04,
35682
+ "learning_rate": 0.0004,
35683
+ "loss": 3.1209,
35684
+ "step": 5853
35685
+ },
35686
+ {
35687
+ "epoch": 0.04,
35688
+ "learning_rate": 0.0004,
35689
+ "loss": 7.3284,
35690
+ "step": 5854
35691
+ },
35692
+ {
35693
+ "epoch": 0.04,
35694
+ "learning_rate": 0.0004,
35695
+ "loss": 5.096,
35696
+ "step": 5855
35697
+ },
35698
+ {
35699
+ "epoch": 0.04,
35700
+ "learning_rate": 0.0004,
35701
+ "loss": 5.7432,
35702
+ "step": 5856
35703
+ },
35704
+ {
35705
+ "epoch": 0.04,
35706
+ "learning_rate": 0.0004,
35707
+ "loss": 7.9329,
35708
+ "step": 5857
35709
+ },
35710
+ {
35711
+ "epoch": 0.04,
35712
+ "learning_rate": 0.0004,
35713
+ "loss": 3.5233,
35714
+ "step": 5858
35715
+ },
35716
+ {
35717
+ "epoch": 0.04,
35718
+ "learning_rate": 0.0004,
35719
+ "loss": 2.4872,
35720
+ "step": 5859
35721
+ },
35722
+ {
35723
+ "epoch": 0.04,
35724
+ "learning_rate": 0.0004,
35725
+ "loss": 8.2481,
35726
+ "step": 5860
35727
+ },
35728
+ {
35729
+ "epoch": 0.04,
35730
+ "learning_rate": 0.0004,
35731
+ "loss": 3.1908,
35732
+ "step": 5861
35733
+ },
35734
+ {
35735
+ "epoch": 0.04,
35736
+ "learning_rate": 0.0004,
35737
+ "loss": 7.7033,
35738
+ "step": 5862
35739
+ },
35740
+ {
35741
+ "epoch": 0.04,
35742
+ "learning_rate": 0.0004,
35743
+ "loss": 6.8059,
35744
+ "step": 5863
35745
+ },
35746
+ {
35747
+ "epoch": 0.04,
35748
+ "learning_rate": 0.0004,
35749
+ "loss": 4.1783,
35750
+ "step": 5864
35751
+ },
35752
+ {
35753
+ "epoch": 0.04,
35754
+ "learning_rate": 0.0004,
35755
+ "loss": 2.6015,
35756
+ "step": 5865
35757
+ },
35758
+ {
35759
+ "epoch": 0.04,
35760
+ "learning_rate": 0.0004,
35761
+ "loss": 5.8913,
35762
+ "step": 5866
35763
+ },
35764
+ {
35765
+ "epoch": 0.04,
35766
+ "learning_rate": 0.0004,
35767
+ "loss": 4.0391,
35768
+ "step": 5867
35769
+ },
35770
+ {
35771
+ "epoch": 0.04,
35772
+ "learning_rate": 0.0004,
35773
+ "loss": 6.6047,
35774
+ "step": 5868
35775
+ },
35776
+ {
35777
+ "epoch": 0.04,
35778
+ "learning_rate": 0.0004,
35779
+ "loss": 4.9347,
35780
+ "step": 5869
35781
+ },
35782
+ {
35783
+ "epoch": 0.04,
35784
+ "learning_rate": 0.0004,
35785
+ "loss": 7.2305,
35786
+ "step": 5870
35787
+ },
35788
+ {
35789
+ "epoch": 0.04,
35790
+ "learning_rate": 0.0004,
35791
+ "loss": 5.8909,
35792
+ "step": 5871
35793
+ },
35794
+ {
35795
+ "epoch": 0.04,
35796
+ "learning_rate": 0.0004,
35797
+ "loss": 5.762,
35798
+ "step": 5872
35799
+ },
35800
+ {
35801
+ "epoch": 0.04,
35802
+ "learning_rate": 0.0004,
35803
+ "loss": 7.7704,
35804
+ "step": 5873
35805
+ },
35806
+ {
35807
+ "epoch": 0.04,
35808
+ "learning_rate": 0.0004,
35809
+ "loss": 4.8633,
35810
+ "step": 5874
35811
+ },
35812
+ {
35813
+ "epoch": 0.04,
35814
+ "learning_rate": 0.0004,
35815
+ "loss": 6.6627,
35816
+ "step": 5875
35817
+ },
35818
+ {
35819
+ "epoch": 0.04,
35820
+ "learning_rate": 0.0004,
35821
+ "loss": 7.5499,
35822
+ "step": 5876
35823
+ },
35824
+ {
35825
+ "epoch": 0.04,
35826
+ "learning_rate": 0.0004,
35827
+ "loss": 6.6472,
35828
+ "step": 5877
35829
+ },
35830
+ {
35831
+ "epoch": 0.04,
35832
+ "learning_rate": 0.0004,
35833
+ "loss": 7.4914,
35834
+ "step": 5878
35835
+ },
35836
+ {
35837
+ "epoch": 0.04,
35838
+ "learning_rate": 0.0004,
35839
+ "loss": 3.9684,
35840
+ "step": 5879
35841
+ },
35842
+ {
35843
+ "epoch": 0.04,
35844
+ "learning_rate": 0.0004,
35845
+ "loss": 6.903,
35846
+ "step": 5880
35847
+ },
35848
+ {
35849
+ "epoch": 0.04,
35850
+ "learning_rate": 0.0004,
35851
+ "loss": 6.4157,
35852
+ "step": 5881
35853
+ },
35854
+ {
35855
+ "epoch": 0.04,
35856
+ "learning_rate": 0.0004,
35857
+ "loss": 6.2942,
35858
+ "step": 5882
35859
+ },
35860
+ {
35861
+ "epoch": 0.04,
35862
+ "learning_rate": 0.0004,
35863
+ "loss": 2.6608,
35864
+ "step": 5883
35865
+ },
35866
+ {
35867
+ "epoch": 0.04,
35868
+ "learning_rate": 0.0004,
35869
+ "loss": 5.3432,
35870
+ "step": 5884
35871
+ },
35872
+ {
35873
+ "epoch": 0.04,
35874
+ "learning_rate": 0.0004,
35875
+ "loss": 2.9234,
35876
+ "step": 5885
35877
+ },
35878
+ {
35879
+ "epoch": 0.04,
35880
+ "learning_rate": 0.0004,
35881
+ "loss": 6.6604,
35882
+ "step": 5886
35883
+ },
35884
+ {
35885
+ "epoch": 0.04,
35886
+ "learning_rate": 0.0004,
35887
+ "loss": 2.7875,
35888
+ "step": 5887
35889
+ },
35890
+ {
35891
+ "epoch": 0.04,
35892
+ "learning_rate": 0.0004,
35893
+ "loss": 5.4057,
35894
+ "step": 5888
35895
+ },
35896
+ {
35897
+ "epoch": 0.04,
35898
+ "learning_rate": 0.0004,
35899
+ "loss": 2.352,
35900
+ "step": 5889
35901
+ },
35902
+ {
35903
+ "epoch": 0.04,
35904
+ "learning_rate": 0.0004,
35905
+ "loss": 2.2785,
35906
+ "step": 5890
35907
+ },
35908
+ {
35909
+ "epoch": 0.05,
35910
+ "learning_rate": 0.0004,
35911
+ "loss": 3.4352,
35912
+ "step": 5891
35913
+ },
35914
+ {
35915
+ "epoch": 0.05,
35916
+ "learning_rate": 0.0004,
35917
+ "loss": 5.6623,
35918
+ "step": 5892
35919
+ },
35920
+ {
35921
+ "epoch": 0.05,
35922
+ "learning_rate": 0.0004,
35923
+ "loss": 2.8201,
35924
+ "step": 5893
35925
+ },
35926
+ {
35927
+ "epoch": 0.05,
35928
+ "learning_rate": 0.0004,
35929
+ "loss": 6.2894,
35930
+ "step": 5894
35931
+ },
35932
+ {
35933
+ "epoch": 0.05,
35934
+ "learning_rate": 0.0004,
35935
+ "loss": 7.0568,
35936
+ "step": 5895
35937
+ },
35938
+ {
35939
+ "epoch": 0.05,
35940
+ "learning_rate": 0.0004,
35941
+ "loss": 9.1368,
35942
+ "step": 5896
35943
+ },
35944
+ {
35945
+ "epoch": 0.05,
35946
+ "learning_rate": 0.0004,
35947
+ "loss": 4.4088,
35948
+ "step": 5897
35949
+ },
35950
+ {
35951
+ "epoch": 0.05,
35952
+ "learning_rate": 0.0004,
35953
+ "loss": 6.5719,
35954
+ "step": 5898
35955
+ },
35956
+ {
35957
+ "epoch": 0.05,
35958
+ "learning_rate": 0.0004,
35959
+ "loss": 7.4458,
35960
+ "step": 5899
35961
+ },
35962
+ {
35963
+ "epoch": 0.05,
35964
+ "learning_rate": 0.0004,
35965
+ "loss": 6.4525,
35966
+ "step": 5900
35967
+ },
35968
+ {
35969
+ "epoch": 0.05,
35970
+ "learning_rate": 0.0004,
35971
+ "loss": 8.5376,
35972
+ "step": 5901
35973
+ },
35974
+ {
35975
+ "epoch": 0.05,
35976
+ "learning_rate": 0.0004,
35977
+ "loss": 6.6726,
35978
+ "step": 5902
35979
+ },
35980
+ {
35981
+ "epoch": 0.05,
35982
+ "learning_rate": 0.0004,
35983
+ "loss": 7.8266,
35984
+ "step": 5903
35985
+ },
35986
+ {
35987
+ "epoch": 0.05,
35988
+ "learning_rate": 0.0004,
35989
+ "loss": 6.7965,
35990
+ "step": 5904
35991
+ },
35992
+ {
35993
+ "epoch": 0.05,
35994
+ "learning_rate": 0.0004,
35995
+ "loss": 4.8672,
35996
+ "step": 5905
35997
+ },
35998
+ {
35999
+ "epoch": 0.05,
36000
+ "learning_rate": 0.0004,
36001
+ "loss": 3.2546,
36002
+ "step": 5906
36003
+ },
36004
+ {
36005
+ "epoch": 0.05,
36006
+ "learning_rate": 0.0004,
36007
+ "loss": 7.9688,
36008
+ "step": 5907
36009
+ },
36010
+ {
36011
+ "epoch": 0.05,
36012
+ "learning_rate": 0.0004,
36013
+ "loss": 7.4705,
36014
+ "step": 5908
36015
+ },
36016
+ {
36017
+ "epoch": 0.05,
36018
+ "learning_rate": 0.0004,
36019
+ "loss": 3.4179,
36020
+ "step": 5909
36021
+ },
36022
+ {
36023
+ "epoch": 0.05,
36024
+ "learning_rate": 0.0004,
36025
+ "loss": 7.4204,
36026
+ "step": 5910
36027
+ },
36028
+ {
36029
+ "epoch": 0.05,
36030
+ "learning_rate": 0.0004,
36031
+ "loss": 7.7445,
36032
+ "step": 5911
36033
+ },
36034
+ {
36035
+ "epoch": 0.05,
36036
+ "learning_rate": 0.0004,
36037
+ "loss": 6.9589,
36038
+ "step": 5912
36039
+ },
36040
+ {
36041
+ "epoch": 0.05,
36042
+ "learning_rate": 0.0004,
36043
+ "loss": 7.5771,
36044
+ "step": 5913
36045
+ },
36046
+ {
36047
+ "epoch": 0.05,
36048
+ "learning_rate": 0.0004,
36049
+ "loss": 9.1289,
36050
+ "step": 5914
36051
+ },
36052
+ {
36053
+ "epoch": 0.05,
36054
+ "learning_rate": 0.0004,
36055
+ "loss": 4.0564,
36056
+ "step": 5915
36057
+ },
36058
+ {
36059
+ "epoch": 0.05,
36060
+ "learning_rate": 0.0004,
36061
+ "loss": 7.5045,
36062
+ "step": 5916
36063
+ },
36064
+ {
36065
+ "epoch": 0.05,
36066
+ "learning_rate": 0.0004,
36067
+ "loss": 6.6193,
36068
+ "step": 5917
36069
+ },
36070
+ {
36071
+ "epoch": 0.05,
36072
+ "learning_rate": 0.0004,
36073
+ "loss": 2.9347,
36074
+ "step": 5918
36075
+ },
36076
+ {
36077
+ "epoch": 0.05,
36078
+ "learning_rate": 0.0004,
36079
+ "loss": 7.276,
36080
+ "step": 5919
36081
+ },
36082
+ {
36083
+ "epoch": 0.05,
36084
+ "learning_rate": 0.0004,
36085
+ "loss": 6.0243,
36086
+ "step": 5920
36087
+ },
36088
+ {
36089
+ "epoch": 0.05,
36090
+ "learning_rate": 0.0004,
36091
+ "loss": 8.8889,
36092
+ "step": 5921
36093
+ },
36094
+ {
36095
+ "epoch": 0.05,
36096
+ "learning_rate": 0.0004,
36097
+ "loss": 4.8016,
36098
+ "step": 5922
36099
+ },
36100
+ {
36101
+ "epoch": 0.05,
36102
+ "learning_rate": 0.0004,
36103
+ "loss": 7.6244,
36104
+ "step": 5923
36105
+ },
36106
+ {
36107
+ "epoch": 0.05,
36108
+ "learning_rate": 0.0004,
36109
+ "loss": 4.6548,
36110
+ "step": 5924
36111
+ },
36112
+ {
36113
+ "epoch": 0.05,
36114
+ "learning_rate": 0.0004,
36115
+ "loss": 5.446,
36116
+ "step": 5925
36117
+ },
36118
+ {
36119
+ "epoch": 0.05,
36120
+ "learning_rate": 0.0004,
36121
+ "loss": 3.0701,
36122
+ "step": 5926
36123
+ },
36124
+ {
36125
+ "epoch": 0.05,
36126
+ "learning_rate": 0.0004,
36127
+ "loss": 3.6489,
36128
+ "step": 5927
36129
+ },
36130
+ {
36131
+ "epoch": 0.05,
36132
+ "learning_rate": 0.0004,
36133
+ "loss": 6.8636,
36134
+ "step": 5928
36135
+ },
36136
+ {
36137
+ "epoch": 0.05,
36138
+ "learning_rate": 0.0004,
36139
+ "loss": 7.3796,
36140
+ "step": 5929
36141
+ },
36142
+ {
36143
+ "epoch": 0.05,
36144
+ "learning_rate": 0.0004,
36145
+ "loss": 6.3366,
36146
+ "step": 5930
36147
+ },
36148
+ {
36149
+ "epoch": 0.05,
36150
+ "learning_rate": 0.0004,
36151
+ "loss": 7.4844,
36152
+ "step": 5931
36153
+ },
36154
+ {
36155
+ "epoch": 0.05,
36156
+ "learning_rate": 0.0004,
36157
+ "loss": 5.5549,
36158
+ "step": 5932
36159
+ },
36160
+ {
36161
+ "epoch": 0.05,
36162
+ "learning_rate": 0.0004,
36163
+ "loss": 7.976,
36164
+ "step": 5933
36165
+ },
36166
+ {
36167
+ "epoch": 0.05,
36168
+ "learning_rate": 0.0004,
36169
+ "loss": 7.0844,
36170
+ "step": 5934
36171
+ },
36172
+ {
36173
+ "epoch": 0.05,
36174
+ "learning_rate": 0.0004,
36175
+ "loss": 3.5849,
36176
+ "step": 5935
36177
+ },
36178
+ {
36179
+ "epoch": 0.05,
36180
+ "learning_rate": 0.0004,
36181
+ "loss": 6.5648,
36182
+ "step": 5936
36183
+ },
36184
+ {
36185
+ "epoch": 0.05,
36186
+ "learning_rate": 0.0004,
36187
+ "loss": 6.8267,
36188
+ "step": 5937
36189
+ },
36190
+ {
36191
+ "epoch": 0.05,
36192
+ "learning_rate": 0.0004,
36193
+ "loss": 4.0671,
36194
+ "step": 5938
36195
+ },
36196
+ {
36197
+ "epoch": 0.05,
36198
+ "learning_rate": 0.0004,
36199
+ "loss": 5.9199,
36200
+ "step": 5939
36201
+ },
36202
+ {
36203
+ "epoch": 0.05,
36204
+ "learning_rate": 0.0004,
36205
+ "loss": 6.7518,
36206
+ "step": 5940
36207
+ },
36208
+ {
36209
+ "epoch": 0.05,
36210
+ "learning_rate": 0.0004,
36211
+ "loss": 2.9931,
36212
+ "step": 5941
36213
+ },
36214
+ {
36215
+ "epoch": 0.05,
36216
+ "learning_rate": 0.0004,
36217
+ "loss": 4.1515,
36218
+ "step": 5942
36219
+ },
36220
+ {
36221
+ "epoch": 0.05,
36222
+ "learning_rate": 0.0004,
36223
+ "loss": 5.4225,
36224
+ "step": 5943
36225
+ },
36226
+ {
36227
+ "epoch": 0.05,
36228
+ "learning_rate": 0.0004,
36229
+ "loss": 4.7662,
36230
+ "step": 5944
36231
+ },
36232
+ {
36233
+ "epoch": 0.05,
36234
+ "learning_rate": 0.0004,
36235
+ "loss": 4.7916,
36236
+ "step": 5945
36237
+ },
36238
+ {
36239
+ "epoch": 0.05,
36240
+ "learning_rate": 0.0004,
36241
+ "loss": 5.6711,
36242
+ "step": 5946
36243
+ },
36244
+ {
36245
+ "epoch": 0.05,
36246
+ "learning_rate": 0.0004,
36247
+ "loss": 6.4338,
36248
+ "step": 5947
36249
+ },
36250
+ {
36251
+ "epoch": 0.05,
36252
+ "learning_rate": 0.0004,
36253
+ "loss": 6.1612,
36254
+ "step": 5948
36255
+ },
36256
+ {
36257
+ "epoch": 0.05,
36258
+ "learning_rate": 0.0004,
36259
+ "loss": 4.3135,
36260
+ "step": 5949
36261
+ },
36262
+ {
36263
+ "epoch": 0.05,
36264
+ "learning_rate": 0.0004,
36265
+ "loss": 5.6296,
36266
+ "step": 5950
36267
+ },
36268
+ {
36269
+ "epoch": 0.05,
36270
+ "learning_rate": 0.0004,
36271
+ "loss": 8.2795,
36272
+ "step": 5951
36273
+ },
36274
+ {
36275
+ "epoch": 0.05,
36276
+ "learning_rate": 0.0004,
36277
+ "loss": 7.2667,
36278
+ "step": 5952
36279
+ },
36280
+ {
36281
+ "epoch": 0.05,
36282
+ "learning_rate": 0.0004,
36283
+ "loss": 4.4897,
36284
+ "step": 5953
36285
+ },
36286
+ {
36287
+ "epoch": 0.05,
36288
+ "learning_rate": 0.0004,
36289
+ "loss": 3.9241,
36290
+ "step": 5954
36291
+ },
36292
+ {
36293
+ "epoch": 0.05,
36294
+ "learning_rate": 0.0004,
36295
+ "loss": 7.776,
36296
+ "step": 5955
36297
+ },
36298
+ {
36299
+ "epoch": 0.05,
36300
+ "learning_rate": 0.0004,
36301
+ "loss": 7.3649,
36302
+ "step": 5956
36303
+ },
36304
+ {
36305
+ "epoch": 0.05,
36306
+ "learning_rate": 0.0004,
36307
+ "loss": 2.6375,
36308
+ "step": 5957
36309
+ },
36310
+ {
36311
+ "epoch": 0.05,
36312
+ "learning_rate": 0.0004,
36313
+ "loss": 9.0611,
36314
+ "step": 5958
36315
+ },
36316
+ {
36317
+ "epoch": 0.05,
36318
+ "learning_rate": 0.0004,
36319
+ "loss": 6.7652,
36320
+ "step": 5959
36321
+ },
36322
+ {
36323
+ "epoch": 0.05,
36324
+ "learning_rate": 0.0004,
36325
+ "loss": 8.7396,
36326
+ "step": 5960
36327
+ },
36328
+ {
36329
+ "epoch": 0.05,
36330
+ "learning_rate": 0.0004,
36331
+ "loss": 7.8184,
36332
+ "step": 5961
36333
+ },
36334
+ {
36335
+ "epoch": 0.05,
36336
+ "learning_rate": 0.0004,
36337
+ "loss": 6.9717,
36338
+ "step": 5962
36339
+ },
36340
+ {
36341
+ "epoch": 0.05,
36342
+ "learning_rate": 0.0004,
36343
+ "loss": 6.7367,
36344
+ "step": 5963
36345
+ },
36346
+ {
36347
+ "epoch": 0.05,
36348
+ "learning_rate": 0.0004,
36349
+ "loss": 5.3137,
36350
+ "step": 5964
36351
+ },
36352
+ {
36353
+ "epoch": 0.05,
36354
+ "learning_rate": 0.0004,
36355
+ "loss": 7.5619,
36356
+ "step": 5965
36357
+ },
36358
+ {
36359
+ "epoch": 0.05,
36360
+ "learning_rate": 0.0004,
36361
+ "loss": 5.5172,
36362
+ "step": 5966
36363
+ },
36364
+ {
36365
+ "epoch": 0.05,
36366
+ "learning_rate": 0.0004,
36367
+ "loss": 7.5568,
36368
+ "step": 5967
36369
+ },
36370
+ {
36371
+ "epoch": 0.05,
36372
+ "learning_rate": 0.0004,
36373
+ "loss": 8.1321,
36374
+ "step": 5968
36375
+ },
36376
+ {
36377
+ "epoch": 0.05,
36378
+ "learning_rate": 0.0004,
36379
+ "loss": 8.8486,
36380
+ "step": 5969
36381
+ },
36382
+ {
36383
+ "epoch": 0.05,
36384
+ "learning_rate": 0.0004,
36385
+ "loss": 3.6196,
36386
+ "step": 5970
36387
+ },
36388
+ {
36389
+ "epoch": 0.05,
36390
+ "learning_rate": 0.0004,
36391
+ "loss": 7.7649,
36392
+ "step": 5971
36393
+ },
36394
+ {
36395
+ "epoch": 0.05,
36396
+ "learning_rate": 0.0004,
36397
+ "loss": 8.096,
36398
+ "step": 5972
36399
+ },
36400
+ {
36401
+ "epoch": 0.05,
36402
+ "learning_rate": 0.0004,
36403
+ "loss": 3.2377,
36404
+ "step": 5973
36405
+ },
36406
+ {
36407
+ "epoch": 0.05,
36408
+ "learning_rate": 0.0004,
36409
+ "loss": 7.9327,
36410
+ "step": 5974
36411
+ },
36412
+ {
36413
+ "epoch": 0.05,
36414
+ "learning_rate": 0.0004,
36415
+ "loss": 3.0676,
36416
+ "step": 5975
36417
+ },
36418
+ {
36419
+ "epoch": 0.05,
36420
+ "learning_rate": 0.0004,
36421
+ "loss": 6.9014,
36422
+ "step": 5976
36423
+ },
36424
+ {
36425
+ "epoch": 0.05,
36426
+ "learning_rate": 0.0004,
36427
+ "loss": 7.9241,
36428
+ "step": 5977
36429
+ },
36430
+ {
36431
+ "epoch": 0.05,
36432
+ "learning_rate": 0.0004,
36433
+ "loss": 12.1662,
36434
+ "step": 5978
36435
+ },
36436
+ {
36437
+ "epoch": 0.05,
36438
+ "learning_rate": 0.0004,
36439
+ "loss": 2.9906,
36440
+ "step": 5979
36441
+ },
36442
+ {
36443
+ "epoch": 0.05,
36444
+ "learning_rate": 0.0004,
36445
+ "loss": 4.6138,
36446
+ "step": 5980
36447
+ },
36448
+ {
36449
+ "epoch": 0.05,
36450
+ "learning_rate": 0.0004,
36451
+ "loss": 2.8328,
36452
+ "step": 5981
36453
+ },
36454
+ {
36455
+ "epoch": 0.05,
36456
+ "learning_rate": 0.0004,
36457
+ "loss": 2.6569,
36458
+ "step": 5982
36459
+ },
36460
+ {
36461
+ "epoch": 0.05,
36462
+ "learning_rate": 0.0004,
36463
+ "loss": 6.6642,
36464
+ "step": 5983
36465
+ },
36466
+ {
36467
+ "epoch": 0.05,
36468
+ "learning_rate": 0.0004,
36469
+ "loss": 4.8701,
36470
+ "step": 5984
36471
+ },
36472
+ {
36473
+ "epoch": 0.05,
36474
+ "learning_rate": 0.0004,
36475
+ "loss": 2.4972,
36476
+ "step": 5985
36477
+ },
36478
+ {
36479
+ "epoch": 0.05,
36480
+ "learning_rate": 0.0004,
36481
+ "loss": 3.1518,
36482
+ "step": 5986
36483
+ },
36484
+ {
36485
+ "epoch": 0.05,
36486
+ "learning_rate": 0.0004,
36487
+ "loss": 7.1437,
36488
+ "step": 5987
36489
+ },
36490
+ {
36491
+ "epoch": 0.05,
36492
+ "learning_rate": 0.0004,
36493
+ "loss": 6.2173,
36494
+ "step": 5988
36495
+ },
36496
+ {
36497
+ "epoch": 0.05,
36498
+ "learning_rate": 0.0004,
36499
+ "loss": 6.7305,
36500
+ "step": 5989
36501
+ },
36502
+ {
36503
+ "epoch": 0.05,
36504
+ "learning_rate": 0.0004,
36505
+ "loss": 7.6896,
36506
+ "step": 5990
36507
+ },
36508
+ {
36509
+ "epoch": 0.05,
36510
+ "learning_rate": 0.0004,
36511
+ "loss": 7.5627,
36512
+ "step": 5991
36513
+ },
36514
+ {
36515
+ "epoch": 0.05,
36516
+ "learning_rate": 0.0004,
36517
+ "loss": 4.5204,
36518
+ "step": 5992
36519
+ },
36520
+ {
36521
+ "epoch": 0.05,
36522
+ "learning_rate": 0.0004,
36523
+ "loss": 5.9454,
36524
+ "step": 5993
36525
+ },
36526
+ {
36527
+ "epoch": 0.05,
36528
+ "learning_rate": 0.0004,
36529
+ "loss": 6.4362,
36530
+ "step": 5994
36531
+ },
36532
+ {
36533
+ "epoch": 0.05,
36534
+ "learning_rate": 0.0004,
36535
+ "loss": 4.0131,
36536
+ "step": 5995
36537
+ },
36538
+ {
36539
+ "epoch": 0.05,
36540
+ "learning_rate": 0.0004,
36541
+ "loss": 6.1399,
36542
+ "step": 5996
36543
+ },
36544
+ {
36545
+ "epoch": 0.05,
36546
+ "learning_rate": 0.0004,
36547
+ "loss": 7.666,
36548
+ "step": 5997
36549
+ },
36550
+ {
36551
+ "epoch": 0.05,
36552
+ "learning_rate": 0.0004,
36553
+ "loss": 8.962,
36554
+ "step": 5998
36555
+ },
36556
+ {
36557
+ "epoch": 0.05,
36558
+ "learning_rate": 0.0004,
36559
+ "loss": 3.4282,
36560
+ "step": 5999
36561
+ },
36562
+ {
36563
+ "epoch": 0.05,
36564
+ "learning_rate": 0.0004,
36565
+ "loss": 3.7265,
36566
+ "step": 6000
36567
+ },
36568
+ {
36569
+ "epoch": 0.05,
36570
+ "eval_loss": 6.473691463470459,
36571
+ "eval_runtime": 22.3658,
36572
+ "eval_samples_per_second": 2.236,
36573
+ "eval_steps_per_second": 1.118,
36574
+ "step": 6000
36575
+ },
36576
+ {
36577
+ "epoch": 0.05,
36578
+ "mmlu_eval_accuracy": 0.2525477994227994,
36579
+ "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
36580
+ "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
36581
+ "mmlu_eval_accuracy_astronomy": 0.3125,
36582
+ "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
36583
+ "mmlu_loss": 3.9286953735351564,
36584
+ "step": 6000
36585
+ },
36586
+ {
36587
+ "epoch": 0.05,
36588
+ "step": 6000,
36589
+ "total_flos": 9.88792958631936e+16,
36590
+ "train_loss": 0.5874443841576577,
36591
+ "train_runtime": 1725.6374,
36592
+ "train_samples_per_second": 17.385,
36593
+ "train_steps_per_second": 17.385
36594
  }
36595
  ],
36596
  "max_steps": 30000,
36597
  "num_train_epochs": 1,
36598
+ "total_flos": 9.88792958631936e+16,
36599
  "trial_name": null,
36600
  "trial_params": null
36601
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe7d931ebfbcece1009124b9eae98d1a465edd703240c0655ee9bb17db395973
3
  size 6011
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85783faab59f5f6d8bcf691e35bb86cff435e22f3fa9169bf4e56c0239c8d7e4
3
  size 6011