Farouk commited on
Commit
fbd1944
·
1 Parent(s): f277292

Training in progress, step 5600

Browse files
adapter_config.json CHANGED
@@ -14,13 +14,13 @@
14
  "r": 64,
15
  "revision": null,
16
  "target_modules": [
17
- "v_proj",
 
18
  "down_proj",
19
- "q_proj",
20
  "gate_proj",
21
- "o_proj",
22
  "up_proj",
23
- "k_proj"
 
24
  ],
25
  "task_type": "CAUSAL_LM"
26
  }
 
14
  "r": 64,
15
  "revision": null,
16
  "target_modules": [
17
+ "o_proj",
18
+ "k_proj",
19
  "down_proj",
 
20
  "gate_proj",
 
21
  "up_proj",
22
+ "v_proj",
23
+ "q_proj"
24
  ],
25
  "task_type": "CAUSAL_LM"
26
  }
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ae21f54a8b0c005a487ec9988c364f042955befb630e69d6d65a1db04ac43b3
3
  size 871609293
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a81912e202e5ef1e0abe63acfe2600dcb02f3768a37b847ef40a41ebbb64f69
3
  size 871609293
all_results.json CHANGED
@@ -1,11 +1,11 @@
1
  {
2
  "epoch": 0.04,
3
  "eval_loss": 6.335043907165527,
4
- "eval_runtime": 21.5795,
5
- "eval_samples_per_second": 2.317,
6
- "eval_steps_per_second": 1.159,
7
- "train_loss": 1.2399261393149694,
8
- "train_runtime": 2886.6567,
9
- "train_samples_per_second": 10.393,
10
- "train_steps_per_second": 10.393
11
  }
 
1
  {
2
  "epoch": 0.04,
3
  "eval_loss": 6.335043907165527,
4
+ "eval_runtime": 21.6378,
5
+ "eval_samples_per_second": 2.311,
6
+ "eval_steps_per_second": 1.155,
7
+ "train_loss": 0.6445872698006807,
8
+ "train_runtime": 1748.3273,
9
+ "train_samples_per_second": 17.159,
10
+ "train_steps_per_second": 17.159
11
  }
checkpoint-4200/adapter_model/adapter_model/README.md CHANGED
@@ -70,6 +70,28 @@ The following `bitsandbytes` quantization config was used during training:
70
  - bnb_4bit_use_double_quant: True
71
  - bnb_4bit_compute_dtype: bfloat16
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  The following `bitsandbytes` quantization config was used during training:
74
  - load_in_8bit: False
75
  - load_in_4bit: True
@@ -88,5 +110,7 @@ The following `bitsandbytes` quantization config was used during training:
88
  - PEFT 0.4.0
89
  - PEFT 0.4.0
90
  - PEFT 0.4.0
 
 
91
 
92
  - PEFT 0.4.0
 
70
  - bnb_4bit_use_double_quant: True
71
  - bnb_4bit_compute_dtype: bfloat16
72
 
73
+ The following `bitsandbytes` quantization config was used during training:
74
+ - load_in_8bit: False
75
+ - load_in_4bit: True
76
+ - llm_int8_threshold: 6.0
77
+ - llm_int8_skip_modules: None
78
+ - llm_int8_enable_fp32_cpu_offload: False
79
+ - llm_int8_has_fp16_weight: False
80
+ - bnb_4bit_quant_type: nf4
81
+ - bnb_4bit_use_double_quant: True
82
+ - bnb_4bit_compute_dtype: bfloat16
83
+
84
+ The following `bitsandbytes` quantization config was used during training:
85
+ - load_in_8bit: False
86
+ - load_in_4bit: True
87
+ - llm_int8_threshold: 6.0
88
+ - llm_int8_skip_modules: None
89
+ - llm_int8_enable_fp32_cpu_offload: False
90
+ - llm_int8_has_fp16_weight: False
91
+ - bnb_4bit_quant_type: nf4
92
+ - bnb_4bit_use_double_quant: True
93
+ - bnb_4bit_compute_dtype: bfloat16
94
+
95
  The following `bitsandbytes` quantization config was used during training:
96
  - load_in_8bit: False
97
  - load_in_4bit: True
 
110
  - PEFT 0.4.0
111
  - PEFT 0.4.0
112
  - PEFT 0.4.0
113
+ - PEFT 0.4.0
114
+ - PEFT 0.4.0
115
 
116
  - PEFT 0.4.0
checkpoint-4200/adapter_model/adapter_model/adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:924afe9e70391c283830444e2984e95d631b7461608ce334050bf7437f49b4c7
3
  size 871609293
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff18c40f9b3c9fb20f1c95d4dff151244eba09eee79ae11c6121cc23181c2442
3
  size 871609293
checkpoint-5600/README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - load_in_8bit: False
9
+ - load_in_4bit: True
10
+ - llm_int8_threshold: 6.0
11
+ - llm_int8_skip_modules: None
12
+ - llm_int8_enable_fp32_cpu_offload: False
13
+ - llm_int8_has_fp16_weight: False
14
+ - bnb_4bit_quant_type: nf4
15
+ - bnb_4bit_use_double_quant: True
16
+ - bnb_4bit_compute_dtype: bfloat16
17
+ ### Framework versions
18
+
19
+
20
+ - PEFT 0.4.0
checkpoint-5600/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 16.0,
11
+ "lora_dropout": 0.1,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 64,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "o_proj",
18
+ "k_proj",
19
+ "down_proj",
20
+ "gate_proj",
21
+ "up_proj",
22
+ "v_proj",
23
+ "q_proj"
24
+ ],
25
+ "task_type": "CAUSAL_LM"
26
+ }
checkpoint-5600/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a81912e202e5ef1e0abe63acfe2600dcb02f3768a37b847ef40a41ebbb64f69
3
+ size 871609293
checkpoint-5600/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[PAD]": 32000
3
+ }
checkpoint-5600/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f1ce7f0dd170bfeebb821db5f0cfcca98b3957b20a9caeefcb11d959a230f9e
3
+ size 873872799
checkpoint-5600/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1eacbdddf8408ff496013b66ade44228149b42f2f803cd158b398d7288028823
3
+ size 14511
checkpoint-5600/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81248501833af563175f43c1d681185643b8411cee1fb1e631b8687c465eb2e3
3
+ size 627
checkpoint-5600/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "[PAD]",
5
+ "unk_token": "<unk>"
6
+ }
checkpoint-5600/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
checkpoint-5600/tokenizer_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "bos_token": {
5
+ "__type": "AddedToken",
6
+ "content": "<s>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "clean_up_tokenization_spaces": false,
13
+ "eos_token": {
14
+ "__type": "AddedToken",
15
+ "content": "</s>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false
20
+ },
21
+ "legacy": null,
22
+ "model_max_length": 1000000000000000019884624838656,
23
+ "pad_token": null,
24
+ "padding_side": "right",
25
+ "sp_model_kwargs": {},
26
+ "tokenizer_class": "LlamaTokenizer",
27
+ "unk_token": {
28
+ "__type": "AddedToken",
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ }
35
+ }
checkpoint-5600/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-5600/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe7d931ebfbcece1009124b9eae98d1a465edd703240c0655ee9bb17db395973
3
+ size 6011
eval_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "epoch": 0.04,
3
  "eval_loss": 6.335043907165527,
4
- "eval_runtime": 21.5795,
5
- "eval_samples_per_second": 2.317,
6
- "eval_steps_per_second": 1.159
7
  }
 
1
  {
2
  "epoch": 0.04,
3
  "eval_loss": 6.335043907165527,
4
+ "eval_runtime": 21.6378,
5
+ "eval_samples_per_second": 2.311,
6
+ "eval_steps_per_second": 1.155
7
  }
metrics.json CHANGED
@@ -1 +1 @@
1
- {"run_name": "codellama34b_unnatural", "train_runtime": 2886.6567, "train_samples_per_second": 10.393, "train_steps_per_second": 10.393, "train_loss": 1.2399261393149694, "epoch": 0.04, "eval_loss": 6.335043907165527, "eval_runtime": 21.5795, "eval_samples_per_second": 2.317, "eval_steps_per_second": 1.159}
 
1
+ {"run_name": "codellama34b_unnatural", "train_runtime": 1748.3273, "train_samples_per_second": 17.159, "train_steps_per_second": 17.159, "train_loss": 0.6445872698006807, "epoch": 0.04, "eval_loss": 6.335043907165527, "eval_runtime": 21.6378, "eval_samples_per_second": 2.311, "eval_steps_per_second": 1.155}
train_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "epoch": 0.04,
3
- "train_loss": 1.2399261393149694,
4
- "train_runtime": 2886.6567,
5
- "train_samples_per_second": 10.393,
6
- "train_steps_per_second": 10.393
7
  }
 
1
  {
2
  "epoch": 0.04,
3
+ "train_loss": 0.6445872698006807,
4
+ "train_runtime": 1748.3273,
5
+ "train_samples_per_second": 17.159,
6
+ "train_steps_per_second": 17.159
7
  }
trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": 6.335043907165527,
3
  "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-4200",
4
- "epoch": 0.03666641203880529,
5
- "global_step": 4800,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -29265,11 +29265,3674 @@
29265
  "train_runtime": 2886.6567,
29266
  "train_samples_per_second": 10.393,
29267
  "train_steps_per_second": 10.393
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29268
  }
29269
  ],
29270
  "max_steps": 30000,
29271
  "num_train_epochs": 1,
29272
- "total_flos": 7.930864121570918e+16,
29273
  "trial_name": null,
29274
  "trial_params": null
29275
  }
 
1
  {
2
  "best_metric": 6.335043907165527,
3
  "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-4200",
4
+ "epoch": 0.04124971354365595,
5
+ "global_step": 5400,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
29265
  "train_runtime": 2886.6567,
29266
  "train_samples_per_second": 10.393,
29267
  "train_steps_per_second": 10.393
29268
+ },
29269
+ {
29270
+ "epoch": 0.04,
29271
+ "learning_rate": 0.0004,
29272
+ "loss": 8.196,
29273
+ "step": 4801
29274
+ },
29275
+ {
29276
+ "epoch": 0.04,
29277
+ "learning_rate": 0.0004,
29278
+ "loss": 7.4766,
29279
+ "step": 4802
29280
+ },
29281
+ {
29282
+ "epoch": 0.04,
29283
+ "learning_rate": 0.0004,
29284
+ "loss": 7.5177,
29285
+ "step": 4803
29286
+ },
29287
+ {
29288
+ "epoch": 0.04,
29289
+ "learning_rate": 0.0004,
29290
+ "loss": 7.6057,
29291
+ "step": 4804
29292
+ },
29293
+ {
29294
+ "epoch": 0.04,
29295
+ "learning_rate": 0.0004,
29296
+ "loss": 7.9972,
29297
+ "step": 4805
29298
+ },
29299
+ {
29300
+ "epoch": 0.04,
29301
+ "learning_rate": 0.0004,
29302
+ "loss": 8.3521,
29303
+ "step": 4806
29304
+ },
29305
+ {
29306
+ "epoch": 0.04,
29307
+ "learning_rate": 0.0004,
29308
+ "loss": 7.9037,
29309
+ "step": 4807
29310
+ },
29311
+ {
29312
+ "epoch": 0.04,
29313
+ "learning_rate": 0.0004,
29314
+ "loss": 8.967,
29315
+ "step": 4808
29316
+ },
29317
+ {
29318
+ "epoch": 0.04,
29319
+ "learning_rate": 0.0004,
29320
+ "loss": 6.2405,
29321
+ "step": 4809
29322
+ },
29323
+ {
29324
+ "epoch": 0.04,
29325
+ "learning_rate": 0.0004,
29326
+ "loss": 8.5253,
29327
+ "step": 4810
29328
+ },
29329
+ {
29330
+ "epoch": 0.04,
29331
+ "learning_rate": 0.0004,
29332
+ "loss": 4.2458,
29333
+ "step": 4811
29334
+ },
29335
+ {
29336
+ "epoch": 0.04,
29337
+ "learning_rate": 0.0004,
29338
+ "loss": 7.179,
29339
+ "step": 4812
29340
+ },
29341
+ {
29342
+ "epoch": 0.04,
29343
+ "learning_rate": 0.0004,
29344
+ "loss": 7.6969,
29345
+ "step": 4813
29346
+ },
29347
+ {
29348
+ "epoch": 0.04,
29349
+ "learning_rate": 0.0004,
29350
+ "loss": 9.2289,
29351
+ "step": 4814
29352
+ },
29353
+ {
29354
+ "epoch": 0.04,
29355
+ "learning_rate": 0.0004,
29356
+ "loss": 7.0946,
29357
+ "step": 4815
29358
+ },
29359
+ {
29360
+ "epoch": 0.04,
29361
+ "learning_rate": 0.0004,
29362
+ "loss": 7.9045,
29363
+ "step": 4816
29364
+ },
29365
+ {
29366
+ "epoch": 0.04,
29367
+ "learning_rate": 0.0004,
29368
+ "loss": 9.6952,
29369
+ "step": 4817
29370
+ },
29371
+ {
29372
+ "epoch": 0.04,
29373
+ "learning_rate": 0.0004,
29374
+ "loss": 6.7265,
29375
+ "step": 4818
29376
+ },
29377
+ {
29378
+ "epoch": 0.04,
29379
+ "learning_rate": 0.0004,
29380
+ "loss": 2.9574,
29381
+ "step": 4819
29382
+ },
29383
+ {
29384
+ "epoch": 0.04,
29385
+ "learning_rate": 0.0004,
29386
+ "loss": 7.3774,
29387
+ "step": 4820
29388
+ },
29389
+ {
29390
+ "epoch": 0.04,
29391
+ "learning_rate": 0.0004,
29392
+ "loss": 6.7837,
29393
+ "step": 4821
29394
+ },
29395
+ {
29396
+ "epoch": 0.04,
29397
+ "learning_rate": 0.0004,
29398
+ "loss": 3.3796,
29399
+ "step": 4822
29400
+ },
29401
+ {
29402
+ "epoch": 0.04,
29403
+ "learning_rate": 0.0004,
29404
+ "loss": 6.4443,
29405
+ "step": 4823
29406
+ },
29407
+ {
29408
+ "epoch": 0.04,
29409
+ "learning_rate": 0.0004,
29410
+ "loss": 8.7734,
29411
+ "step": 4824
29412
+ },
29413
+ {
29414
+ "epoch": 0.04,
29415
+ "learning_rate": 0.0004,
29416
+ "loss": 9.5535,
29417
+ "step": 4825
29418
+ },
29419
+ {
29420
+ "epoch": 0.04,
29421
+ "learning_rate": 0.0004,
29422
+ "loss": 6.1014,
29423
+ "step": 4826
29424
+ },
29425
+ {
29426
+ "epoch": 0.04,
29427
+ "learning_rate": 0.0004,
29428
+ "loss": 3.5574,
29429
+ "step": 4827
29430
+ },
29431
+ {
29432
+ "epoch": 0.04,
29433
+ "learning_rate": 0.0004,
29434
+ "loss": 4.5114,
29435
+ "step": 4828
29436
+ },
29437
+ {
29438
+ "epoch": 0.04,
29439
+ "learning_rate": 0.0004,
29440
+ "loss": 3.431,
29441
+ "step": 4829
29442
+ },
29443
+ {
29444
+ "epoch": 0.04,
29445
+ "learning_rate": 0.0004,
29446
+ "loss": 7.8042,
29447
+ "step": 4830
29448
+ },
29449
+ {
29450
+ "epoch": 0.04,
29451
+ "learning_rate": 0.0004,
29452
+ "loss": 2.4997,
29453
+ "step": 4831
29454
+ },
29455
+ {
29456
+ "epoch": 0.04,
29457
+ "learning_rate": 0.0004,
29458
+ "loss": 7.9027,
29459
+ "step": 4832
29460
+ },
29461
+ {
29462
+ "epoch": 0.04,
29463
+ "learning_rate": 0.0004,
29464
+ "loss": 4.7126,
29465
+ "step": 4833
29466
+ },
29467
+ {
29468
+ "epoch": 0.04,
29469
+ "learning_rate": 0.0004,
29470
+ "loss": 2.8638,
29471
+ "step": 4834
29472
+ },
29473
+ {
29474
+ "epoch": 0.04,
29475
+ "learning_rate": 0.0004,
29476
+ "loss": 6.4997,
29477
+ "step": 4835
29478
+ },
29479
+ {
29480
+ "epoch": 0.04,
29481
+ "learning_rate": 0.0004,
29482
+ "loss": 4.8501,
29483
+ "step": 4836
29484
+ },
29485
+ {
29486
+ "epoch": 0.04,
29487
+ "learning_rate": 0.0004,
29488
+ "loss": 2.6346,
29489
+ "step": 4837
29490
+ },
29491
+ {
29492
+ "epoch": 0.04,
29493
+ "learning_rate": 0.0004,
29494
+ "loss": 2.8403,
29495
+ "step": 4838
29496
+ },
29497
+ {
29498
+ "epoch": 0.04,
29499
+ "learning_rate": 0.0004,
29500
+ "loss": 6.8362,
29501
+ "step": 4839
29502
+ },
29503
+ {
29504
+ "epoch": 0.04,
29505
+ "learning_rate": 0.0004,
29506
+ "loss": 2.8393,
29507
+ "step": 4840
29508
+ },
29509
+ {
29510
+ "epoch": 0.04,
29511
+ "learning_rate": 0.0004,
29512
+ "loss": 2.6428,
29513
+ "step": 4841
29514
+ },
29515
+ {
29516
+ "epoch": 0.04,
29517
+ "learning_rate": 0.0004,
29518
+ "loss": 5.9946,
29519
+ "step": 4842
29520
+ },
29521
+ {
29522
+ "epoch": 0.04,
29523
+ "learning_rate": 0.0004,
29524
+ "loss": 4.3163,
29525
+ "step": 4843
29526
+ },
29527
+ {
29528
+ "epoch": 0.04,
29529
+ "learning_rate": 0.0004,
29530
+ "loss": 6.9659,
29531
+ "step": 4844
29532
+ },
29533
+ {
29534
+ "epoch": 0.04,
29535
+ "learning_rate": 0.0004,
29536
+ "loss": 6.5787,
29537
+ "step": 4845
29538
+ },
29539
+ {
29540
+ "epoch": 0.04,
29541
+ "learning_rate": 0.0004,
29542
+ "loss": 9.0435,
29543
+ "step": 4846
29544
+ },
29545
+ {
29546
+ "epoch": 0.04,
29547
+ "learning_rate": 0.0004,
29548
+ "loss": 5.6627,
29549
+ "step": 4847
29550
+ },
29551
+ {
29552
+ "epoch": 0.04,
29553
+ "learning_rate": 0.0004,
29554
+ "loss": 7.0435,
29555
+ "step": 4848
29556
+ },
29557
+ {
29558
+ "epoch": 0.04,
29559
+ "learning_rate": 0.0004,
29560
+ "loss": 7.584,
29561
+ "step": 4849
29562
+ },
29563
+ {
29564
+ "epoch": 0.04,
29565
+ "learning_rate": 0.0004,
29566
+ "loss": 5.5761,
29567
+ "step": 4850
29568
+ },
29569
+ {
29570
+ "epoch": 0.04,
29571
+ "learning_rate": 0.0004,
29572
+ "loss": 8.0644,
29573
+ "step": 4851
29574
+ },
29575
+ {
29576
+ "epoch": 0.04,
29577
+ "learning_rate": 0.0004,
29578
+ "loss": 6.7897,
29579
+ "step": 4852
29580
+ },
29581
+ {
29582
+ "epoch": 0.04,
29583
+ "learning_rate": 0.0004,
29584
+ "loss": 7.7933,
29585
+ "step": 4853
29586
+ },
29587
+ {
29588
+ "epoch": 0.04,
29589
+ "learning_rate": 0.0004,
29590
+ "loss": 8.0918,
29591
+ "step": 4854
29592
+ },
29593
+ {
29594
+ "epoch": 0.04,
29595
+ "learning_rate": 0.0004,
29596
+ "loss": 8.1191,
29597
+ "step": 4855
29598
+ },
29599
+ {
29600
+ "epoch": 0.04,
29601
+ "learning_rate": 0.0004,
29602
+ "loss": 5.6498,
29603
+ "step": 4856
29604
+ },
29605
+ {
29606
+ "epoch": 0.04,
29607
+ "learning_rate": 0.0004,
29608
+ "loss": 3.1834,
29609
+ "step": 4857
29610
+ },
29611
+ {
29612
+ "epoch": 0.04,
29613
+ "learning_rate": 0.0004,
29614
+ "loss": 7.5713,
29615
+ "step": 4858
29616
+ },
29617
+ {
29618
+ "epoch": 0.04,
29619
+ "learning_rate": 0.0004,
29620
+ "loss": 6.643,
29621
+ "step": 4859
29622
+ },
29623
+ {
29624
+ "epoch": 0.04,
29625
+ "learning_rate": 0.0004,
29626
+ "loss": 6.0051,
29627
+ "step": 4860
29628
+ },
29629
+ {
29630
+ "epoch": 0.04,
29631
+ "learning_rate": 0.0004,
29632
+ "loss": 5.3192,
29633
+ "step": 4861
29634
+ },
29635
+ {
29636
+ "epoch": 0.04,
29637
+ "learning_rate": 0.0004,
29638
+ "loss": 6.6787,
29639
+ "step": 4862
29640
+ },
29641
+ {
29642
+ "epoch": 0.04,
29643
+ "learning_rate": 0.0004,
29644
+ "loss": 6.1336,
29645
+ "step": 4863
29646
+ },
29647
+ {
29648
+ "epoch": 0.04,
29649
+ "learning_rate": 0.0004,
29650
+ "loss": 4.1196,
29651
+ "step": 4864
29652
+ },
29653
+ {
29654
+ "epoch": 0.04,
29655
+ "learning_rate": 0.0004,
29656
+ "loss": 7.7662,
29657
+ "step": 4865
29658
+ },
29659
+ {
29660
+ "epoch": 0.04,
29661
+ "learning_rate": 0.0004,
29662
+ "loss": 6.5099,
29663
+ "step": 4866
29664
+ },
29665
+ {
29666
+ "epoch": 0.04,
29667
+ "learning_rate": 0.0004,
29668
+ "loss": 6.4698,
29669
+ "step": 4867
29670
+ },
29671
+ {
29672
+ "epoch": 0.04,
29673
+ "learning_rate": 0.0004,
29674
+ "loss": 2.9245,
29675
+ "step": 4868
29676
+ },
29677
+ {
29678
+ "epoch": 0.04,
29679
+ "learning_rate": 0.0004,
29680
+ "loss": 3.0627,
29681
+ "step": 4869
29682
+ },
29683
+ {
29684
+ "epoch": 0.04,
29685
+ "learning_rate": 0.0004,
29686
+ "loss": 4.4951,
29687
+ "step": 4870
29688
+ },
29689
+ {
29690
+ "epoch": 0.04,
29691
+ "learning_rate": 0.0004,
29692
+ "loss": 9.7325,
29693
+ "step": 4871
29694
+ },
29695
+ {
29696
+ "epoch": 0.04,
29697
+ "learning_rate": 0.0004,
29698
+ "loss": 5.3552,
29699
+ "step": 4872
29700
+ },
29701
+ {
29702
+ "epoch": 0.04,
29703
+ "learning_rate": 0.0004,
29704
+ "loss": 4.0359,
29705
+ "step": 4873
29706
+ },
29707
+ {
29708
+ "epoch": 0.04,
29709
+ "learning_rate": 0.0004,
29710
+ "loss": 5.259,
29711
+ "step": 4874
29712
+ },
29713
+ {
29714
+ "epoch": 0.04,
29715
+ "learning_rate": 0.0004,
29716
+ "loss": 7.39,
29717
+ "step": 4875
29718
+ },
29719
+ {
29720
+ "epoch": 0.04,
29721
+ "learning_rate": 0.0004,
29722
+ "loss": 5.2283,
29723
+ "step": 4876
29724
+ },
29725
+ {
29726
+ "epoch": 0.04,
29727
+ "learning_rate": 0.0004,
29728
+ "loss": 7.2338,
29729
+ "step": 4877
29730
+ },
29731
+ {
29732
+ "epoch": 0.04,
29733
+ "learning_rate": 0.0004,
29734
+ "loss": 4.5157,
29735
+ "step": 4878
29736
+ },
29737
+ {
29738
+ "epoch": 0.04,
29739
+ "learning_rate": 0.0004,
29740
+ "loss": 6.7822,
29741
+ "step": 4879
29742
+ },
29743
+ {
29744
+ "epoch": 0.04,
29745
+ "learning_rate": 0.0004,
29746
+ "loss": 5.4882,
29747
+ "step": 4880
29748
+ },
29749
+ {
29750
+ "epoch": 0.04,
29751
+ "learning_rate": 0.0004,
29752
+ "loss": 2.9394,
29753
+ "step": 4881
29754
+ },
29755
+ {
29756
+ "epoch": 0.04,
29757
+ "learning_rate": 0.0004,
29758
+ "loss": 9.7844,
29759
+ "step": 4882
29760
+ },
29761
+ {
29762
+ "epoch": 0.04,
29763
+ "learning_rate": 0.0004,
29764
+ "loss": 4.0808,
29765
+ "step": 4883
29766
+ },
29767
+ {
29768
+ "epoch": 0.04,
29769
+ "learning_rate": 0.0004,
29770
+ "loss": 5.4349,
29771
+ "step": 4884
29772
+ },
29773
+ {
29774
+ "epoch": 0.04,
29775
+ "learning_rate": 0.0004,
29776
+ "loss": 4.7747,
29777
+ "step": 4885
29778
+ },
29779
+ {
29780
+ "epoch": 0.04,
29781
+ "learning_rate": 0.0004,
29782
+ "loss": 3.2778,
29783
+ "step": 4886
29784
+ },
29785
+ {
29786
+ "epoch": 0.04,
29787
+ "learning_rate": 0.0004,
29788
+ "loss": 7.9457,
29789
+ "step": 4887
29790
+ },
29791
+ {
29792
+ "epoch": 0.04,
29793
+ "learning_rate": 0.0004,
29794
+ "loss": 3.196,
29795
+ "step": 4888
29796
+ },
29797
+ {
29798
+ "epoch": 0.04,
29799
+ "learning_rate": 0.0004,
29800
+ "loss": 4.2576,
29801
+ "step": 4889
29802
+ },
29803
+ {
29804
+ "epoch": 0.04,
29805
+ "learning_rate": 0.0004,
29806
+ "loss": 6.3978,
29807
+ "step": 4890
29808
+ },
29809
+ {
29810
+ "epoch": 0.04,
29811
+ "learning_rate": 0.0004,
29812
+ "loss": 7.4963,
29813
+ "step": 4891
29814
+ },
29815
+ {
29816
+ "epoch": 0.04,
29817
+ "learning_rate": 0.0004,
29818
+ "loss": 8.8915,
29819
+ "step": 4892
29820
+ },
29821
+ {
29822
+ "epoch": 0.04,
29823
+ "learning_rate": 0.0004,
29824
+ "loss": 2.8344,
29825
+ "step": 4893
29826
+ },
29827
+ {
29828
+ "epoch": 0.04,
29829
+ "learning_rate": 0.0004,
29830
+ "loss": 6.5248,
29831
+ "step": 4894
29832
+ },
29833
+ {
29834
+ "epoch": 0.04,
29835
+ "learning_rate": 0.0004,
29836
+ "loss": 2.9729,
29837
+ "step": 4895
29838
+ },
29839
+ {
29840
+ "epoch": 0.04,
29841
+ "learning_rate": 0.0004,
29842
+ "loss": 2.7504,
29843
+ "step": 4896
29844
+ },
29845
+ {
29846
+ "epoch": 0.04,
29847
+ "learning_rate": 0.0004,
29848
+ "loss": 4.4288,
29849
+ "step": 4897
29850
+ },
29851
+ {
29852
+ "epoch": 0.04,
29853
+ "learning_rate": 0.0004,
29854
+ "loss": 5.173,
29855
+ "step": 4898
29856
+ },
29857
+ {
29858
+ "epoch": 0.04,
29859
+ "learning_rate": 0.0004,
29860
+ "loss": 2.6288,
29861
+ "step": 4899
29862
+ },
29863
+ {
29864
+ "epoch": 0.04,
29865
+ "learning_rate": 0.0004,
29866
+ "loss": 7.3934,
29867
+ "step": 4900
29868
+ },
29869
+ {
29870
+ "epoch": 0.04,
29871
+ "learning_rate": 0.0004,
29872
+ "loss": 2.8056,
29873
+ "step": 4901
29874
+ },
29875
+ {
29876
+ "epoch": 0.04,
29877
+ "learning_rate": 0.0004,
29878
+ "loss": 6.7523,
29879
+ "step": 4902
29880
+ },
29881
+ {
29882
+ "epoch": 0.04,
29883
+ "learning_rate": 0.0004,
29884
+ "loss": 7.6066,
29885
+ "step": 4903
29886
+ },
29887
+ {
29888
+ "epoch": 0.04,
29889
+ "learning_rate": 0.0004,
29890
+ "loss": 8.6161,
29891
+ "step": 4904
29892
+ },
29893
+ {
29894
+ "epoch": 0.04,
29895
+ "learning_rate": 0.0004,
29896
+ "loss": 7.8099,
29897
+ "step": 4905
29898
+ },
29899
+ {
29900
+ "epoch": 0.04,
29901
+ "learning_rate": 0.0004,
29902
+ "loss": 5.2048,
29903
+ "step": 4906
29904
+ },
29905
+ {
29906
+ "epoch": 0.04,
29907
+ "learning_rate": 0.0004,
29908
+ "loss": 2.6112,
29909
+ "step": 4907
29910
+ },
29911
+ {
29912
+ "epoch": 0.04,
29913
+ "learning_rate": 0.0004,
29914
+ "loss": 8.5394,
29915
+ "step": 4908
29916
+ },
29917
+ {
29918
+ "epoch": 0.04,
29919
+ "learning_rate": 0.0004,
29920
+ "loss": 7.9661,
29921
+ "step": 4909
29922
+ },
29923
+ {
29924
+ "epoch": 0.04,
29925
+ "learning_rate": 0.0004,
29926
+ "loss": 7.1945,
29927
+ "step": 4910
29928
+ },
29929
+ {
29930
+ "epoch": 0.04,
29931
+ "learning_rate": 0.0004,
29932
+ "loss": 7.1823,
29933
+ "step": 4911
29934
+ },
29935
+ {
29936
+ "epoch": 0.04,
29937
+ "learning_rate": 0.0004,
29938
+ "loss": 6.5774,
29939
+ "step": 4912
29940
+ },
29941
+ {
29942
+ "epoch": 0.04,
29943
+ "learning_rate": 0.0004,
29944
+ "loss": 7.3444,
29945
+ "step": 4913
29946
+ },
29947
+ {
29948
+ "epoch": 0.04,
29949
+ "learning_rate": 0.0004,
29950
+ "loss": 7.8732,
29951
+ "step": 4914
29952
+ },
29953
+ {
29954
+ "epoch": 0.04,
29955
+ "learning_rate": 0.0004,
29956
+ "loss": 3.5685,
29957
+ "step": 4915
29958
+ },
29959
+ {
29960
+ "epoch": 0.04,
29961
+ "learning_rate": 0.0004,
29962
+ "loss": 4.6944,
29963
+ "step": 4916
29964
+ },
29965
+ {
29966
+ "epoch": 0.04,
29967
+ "learning_rate": 0.0004,
29968
+ "loss": 8.9668,
29969
+ "step": 4917
29970
+ },
29971
+ {
29972
+ "epoch": 0.04,
29973
+ "learning_rate": 0.0004,
29974
+ "loss": 2.9854,
29975
+ "step": 4918
29976
+ },
29977
+ {
29978
+ "epoch": 0.04,
29979
+ "learning_rate": 0.0004,
29980
+ "loss": 9.0986,
29981
+ "step": 4919
29982
+ },
29983
+ {
29984
+ "epoch": 0.04,
29985
+ "learning_rate": 0.0004,
29986
+ "loss": 6.8546,
29987
+ "step": 4920
29988
+ },
29989
+ {
29990
+ "epoch": 0.04,
29991
+ "learning_rate": 0.0004,
29992
+ "loss": 2.658,
29993
+ "step": 4921
29994
+ },
29995
+ {
29996
+ "epoch": 0.04,
29997
+ "learning_rate": 0.0004,
29998
+ "loss": 2.8595,
29999
+ "step": 4922
30000
+ },
30001
+ {
30002
+ "epoch": 0.04,
30003
+ "learning_rate": 0.0004,
30004
+ "loss": 6.3526,
30005
+ "step": 4923
30006
+ },
30007
+ {
30008
+ "epoch": 0.04,
30009
+ "learning_rate": 0.0004,
30010
+ "loss": 6.6612,
30011
+ "step": 4924
30012
+ },
30013
+ {
30014
+ "epoch": 0.04,
30015
+ "learning_rate": 0.0004,
30016
+ "loss": 6.4798,
30017
+ "step": 4925
30018
+ },
30019
+ {
30020
+ "epoch": 0.04,
30021
+ "learning_rate": 0.0004,
30022
+ "loss": 6.0779,
30023
+ "step": 4926
30024
+ },
30025
+ {
30026
+ "epoch": 0.04,
30027
+ "learning_rate": 0.0004,
30028
+ "loss": 2.8211,
30029
+ "step": 4927
30030
+ },
30031
+ {
30032
+ "epoch": 0.04,
30033
+ "learning_rate": 0.0004,
30034
+ "loss": 7.9007,
30035
+ "step": 4928
30036
+ },
30037
+ {
30038
+ "epoch": 0.04,
30039
+ "learning_rate": 0.0004,
30040
+ "loss": 2.5789,
30041
+ "step": 4929
30042
+ },
30043
+ {
30044
+ "epoch": 0.04,
30045
+ "learning_rate": 0.0004,
30046
+ "loss": 8.0357,
30047
+ "step": 4930
30048
+ },
30049
+ {
30050
+ "epoch": 0.04,
30051
+ "learning_rate": 0.0004,
30052
+ "loss": 6.8846,
30053
+ "step": 4931
30054
+ },
30055
+ {
30056
+ "epoch": 0.04,
30057
+ "learning_rate": 0.0004,
30058
+ "loss": 5.7409,
30059
+ "step": 4932
30060
+ },
30061
+ {
30062
+ "epoch": 0.04,
30063
+ "learning_rate": 0.0004,
30064
+ "loss": 8.4081,
30065
+ "step": 4933
30066
+ },
30067
+ {
30068
+ "epoch": 0.04,
30069
+ "learning_rate": 0.0004,
30070
+ "loss": 7.3187,
30071
+ "step": 4934
30072
+ },
30073
+ {
30074
+ "epoch": 0.04,
30075
+ "learning_rate": 0.0004,
30076
+ "loss": 8.1926,
30077
+ "step": 4935
30078
+ },
30079
+ {
30080
+ "epoch": 0.04,
30081
+ "learning_rate": 0.0004,
30082
+ "loss": 8.2912,
30083
+ "step": 4936
30084
+ },
30085
+ {
30086
+ "epoch": 0.04,
30087
+ "learning_rate": 0.0004,
30088
+ "loss": 6.6701,
30089
+ "step": 4937
30090
+ },
30091
+ {
30092
+ "epoch": 0.04,
30093
+ "learning_rate": 0.0004,
30094
+ "loss": 4.8162,
30095
+ "step": 4938
30096
+ },
30097
+ {
30098
+ "epoch": 0.04,
30099
+ "learning_rate": 0.0004,
30100
+ "loss": 2.7585,
30101
+ "step": 4939
30102
+ },
30103
+ {
30104
+ "epoch": 0.04,
30105
+ "learning_rate": 0.0004,
30106
+ "loss": 6.6232,
30107
+ "step": 4940
30108
+ },
30109
+ {
30110
+ "epoch": 0.04,
30111
+ "learning_rate": 0.0004,
30112
+ "loss": 7.9613,
30113
+ "step": 4941
30114
+ },
30115
+ {
30116
+ "epoch": 0.04,
30117
+ "learning_rate": 0.0004,
30118
+ "loss": 3.954,
30119
+ "step": 4942
30120
+ },
30121
+ {
30122
+ "epoch": 0.04,
30123
+ "learning_rate": 0.0004,
30124
+ "loss": 2.7287,
30125
+ "step": 4943
30126
+ },
30127
+ {
30128
+ "epoch": 0.04,
30129
+ "learning_rate": 0.0004,
30130
+ "loss": 4.6305,
30131
+ "step": 4944
30132
+ },
30133
+ {
30134
+ "epoch": 0.04,
30135
+ "learning_rate": 0.0004,
30136
+ "loss": 2.6932,
30137
+ "step": 4945
30138
+ },
30139
+ {
30140
+ "epoch": 0.04,
30141
+ "learning_rate": 0.0004,
30142
+ "loss": 2.6798,
30143
+ "step": 4946
30144
+ },
30145
+ {
30146
+ "epoch": 0.04,
30147
+ "learning_rate": 0.0004,
30148
+ "loss": 3.6665,
30149
+ "step": 4947
30150
+ },
30151
+ {
30152
+ "epoch": 0.04,
30153
+ "learning_rate": 0.0004,
30154
+ "loss": 6.1462,
30155
+ "step": 4948
30156
+ },
30157
+ {
30158
+ "epoch": 0.04,
30159
+ "learning_rate": 0.0004,
30160
+ "loss": 4.0676,
30161
+ "step": 4949
30162
+ },
30163
+ {
30164
+ "epoch": 0.04,
30165
+ "learning_rate": 0.0004,
30166
+ "loss": 4.2834,
30167
+ "step": 4950
30168
+ },
30169
+ {
30170
+ "epoch": 0.04,
30171
+ "learning_rate": 0.0004,
30172
+ "loss": 5.8273,
30173
+ "step": 4951
30174
+ },
30175
+ {
30176
+ "epoch": 0.04,
30177
+ "learning_rate": 0.0004,
30178
+ "loss": 7.635,
30179
+ "step": 4952
30180
+ },
30181
+ {
30182
+ "epoch": 0.04,
30183
+ "learning_rate": 0.0004,
30184
+ "loss": 8.9245,
30185
+ "step": 4953
30186
+ },
30187
+ {
30188
+ "epoch": 0.04,
30189
+ "learning_rate": 0.0004,
30190
+ "loss": 8.5401,
30191
+ "step": 4954
30192
+ },
30193
+ {
30194
+ "epoch": 0.04,
30195
+ "learning_rate": 0.0004,
30196
+ "loss": 8.2944,
30197
+ "step": 4955
30198
+ },
30199
+ {
30200
+ "epoch": 0.04,
30201
+ "learning_rate": 0.0004,
30202
+ "loss": 6.6151,
30203
+ "step": 4956
30204
+ },
30205
+ {
30206
+ "epoch": 0.04,
30207
+ "learning_rate": 0.0004,
30208
+ "loss": 4.3668,
30209
+ "step": 4957
30210
+ },
30211
+ {
30212
+ "epoch": 0.04,
30213
+ "learning_rate": 0.0004,
30214
+ "loss": 7.4506,
30215
+ "step": 4958
30216
+ },
30217
+ {
30218
+ "epoch": 0.04,
30219
+ "learning_rate": 0.0004,
30220
+ "loss": 7.8919,
30221
+ "step": 4959
30222
+ },
30223
+ {
30224
+ "epoch": 0.04,
30225
+ "learning_rate": 0.0004,
30226
+ "loss": 8.7462,
30227
+ "step": 4960
30228
+ },
30229
+ {
30230
+ "epoch": 0.04,
30231
+ "learning_rate": 0.0004,
30232
+ "loss": 5.8915,
30233
+ "step": 4961
30234
+ },
30235
+ {
30236
+ "epoch": 0.04,
30237
+ "learning_rate": 0.0004,
30238
+ "loss": 7.4696,
30239
+ "step": 4962
30240
+ },
30241
+ {
30242
+ "epoch": 0.04,
30243
+ "learning_rate": 0.0004,
30244
+ "loss": 7.0112,
30245
+ "step": 4963
30246
+ },
30247
+ {
30248
+ "epoch": 0.04,
30249
+ "learning_rate": 0.0004,
30250
+ "loss": 8.1888,
30251
+ "step": 4964
30252
+ },
30253
+ {
30254
+ "epoch": 0.04,
30255
+ "learning_rate": 0.0004,
30256
+ "loss": 7.1465,
30257
+ "step": 4965
30258
+ },
30259
+ {
30260
+ "epoch": 0.04,
30261
+ "learning_rate": 0.0004,
30262
+ "loss": 3.6028,
30263
+ "step": 4966
30264
+ },
30265
+ {
30266
+ "epoch": 0.04,
30267
+ "learning_rate": 0.0004,
30268
+ "loss": 3.279,
30269
+ "step": 4967
30270
+ },
30271
+ {
30272
+ "epoch": 0.04,
30273
+ "learning_rate": 0.0004,
30274
+ "loss": 6.4619,
30275
+ "step": 4968
30276
+ },
30277
+ {
30278
+ "epoch": 0.04,
30279
+ "learning_rate": 0.0004,
30280
+ "loss": 6.7617,
30281
+ "step": 4969
30282
+ },
30283
+ {
30284
+ "epoch": 0.04,
30285
+ "learning_rate": 0.0004,
30286
+ "loss": 8.0521,
30287
+ "step": 4970
30288
+ },
30289
+ {
30290
+ "epoch": 0.04,
30291
+ "learning_rate": 0.0004,
30292
+ "loss": 3.9583,
30293
+ "step": 4971
30294
+ },
30295
+ {
30296
+ "epoch": 0.04,
30297
+ "learning_rate": 0.0004,
30298
+ "loss": 8.5725,
30299
+ "step": 4972
30300
+ },
30301
+ {
30302
+ "epoch": 0.04,
30303
+ "learning_rate": 0.0004,
30304
+ "loss": 6.3248,
30305
+ "step": 4973
30306
+ },
30307
+ {
30308
+ "epoch": 0.04,
30309
+ "learning_rate": 0.0004,
30310
+ "loss": 2.9984,
30311
+ "step": 4974
30312
+ },
30313
+ {
30314
+ "epoch": 0.04,
30315
+ "learning_rate": 0.0004,
30316
+ "loss": 5.7955,
30317
+ "step": 4975
30318
+ },
30319
+ {
30320
+ "epoch": 0.04,
30321
+ "learning_rate": 0.0004,
30322
+ "loss": 5.4351,
30323
+ "step": 4976
30324
+ },
30325
+ {
30326
+ "epoch": 0.04,
30327
+ "learning_rate": 0.0004,
30328
+ "loss": 3.5412,
30329
+ "step": 4977
30330
+ },
30331
+ {
30332
+ "epoch": 0.04,
30333
+ "learning_rate": 0.0004,
30334
+ "loss": 9.4986,
30335
+ "step": 4978
30336
+ },
30337
+ {
30338
+ "epoch": 0.04,
30339
+ "learning_rate": 0.0004,
30340
+ "loss": 3.4686,
30341
+ "step": 4979
30342
+ },
30343
+ {
30344
+ "epoch": 0.04,
30345
+ "learning_rate": 0.0004,
30346
+ "loss": 6.3709,
30347
+ "step": 4980
30348
+ },
30349
+ {
30350
+ "epoch": 0.04,
30351
+ "learning_rate": 0.0004,
30352
+ "loss": 3.4326,
30353
+ "step": 4981
30354
+ },
30355
+ {
30356
+ "epoch": 0.04,
30357
+ "learning_rate": 0.0004,
30358
+ "loss": 5.3118,
30359
+ "step": 4982
30360
+ },
30361
+ {
30362
+ "epoch": 0.04,
30363
+ "learning_rate": 0.0004,
30364
+ "loss": 6.2933,
30365
+ "step": 4983
30366
+ },
30367
+ {
30368
+ "epoch": 0.04,
30369
+ "learning_rate": 0.0004,
30370
+ "loss": 7.2728,
30371
+ "step": 4984
30372
+ },
30373
+ {
30374
+ "epoch": 0.04,
30375
+ "learning_rate": 0.0004,
30376
+ "loss": 5.5518,
30377
+ "step": 4985
30378
+ },
30379
+ {
30380
+ "epoch": 0.04,
30381
+ "learning_rate": 0.0004,
30382
+ "loss": 5.8085,
30383
+ "step": 4986
30384
+ },
30385
+ {
30386
+ "epoch": 0.04,
30387
+ "learning_rate": 0.0004,
30388
+ "loss": 6.024,
30389
+ "step": 4987
30390
+ },
30391
+ {
30392
+ "epoch": 0.04,
30393
+ "learning_rate": 0.0004,
30394
+ "loss": 6.7633,
30395
+ "step": 4988
30396
+ },
30397
+ {
30398
+ "epoch": 0.04,
30399
+ "learning_rate": 0.0004,
30400
+ "loss": 3.9099,
30401
+ "step": 4989
30402
+ },
30403
+ {
30404
+ "epoch": 0.04,
30405
+ "learning_rate": 0.0004,
30406
+ "loss": 3.0304,
30407
+ "step": 4990
30408
+ },
30409
+ {
30410
+ "epoch": 0.04,
30411
+ "learning_rate": 0.0004,
30412
+ "loss": 5.939,
30413
+ "step": 4991
30414
+ },
30415
+ {
30416
+ "epoch": 0.04,
30417
+ "learning_rate": 0.0004,
30418
+ "loss": 3.1024,
30419
+ "step": 4992
30420
+ },
30421
+ {
30422
+ "epoch": 0.04,
30423
+ "learning_rate": 0.0004,
30424
+ "loss": 3.2432,
30425
+ "step": 4993
30426
+ },
30427
+ {
30428
+ "epoch": 0.04,
30429
+ "learning_rate": 0.0004,
30430
+ "loss": 6.9213,
30431
+ "step": 4994
30432
+ },
30433
+ {
30434
+ "epoch": 0.04,
30435
+ "learning_rate": 0.0004,
30436
+ "loss": 4.6644,
30437
+ "step": 4995
30438
+ },
30439
+ {
30440
+ "epoch": 0.04,
30441
+ "learning_rate": 0.0004,
30442
+ "loss": 5.9821,
30443
+ "step": 4996
30444
+ },
30445
+ {
30446
+ "epoch": 0.04,
30447
+ "learning_rate": 0.0004,
30448
+ "loss": 4.9677,
30449
+ "step": 4997
30450
+ },
30451
+ {
30452
+ "epoch": 0.04,
30453
+ "learning_rate": 0.0004,
30454
+ "loss": 6.7992,
30455
+ "step": 4998
30456
+ },
30457
+ {
30458
+ "epoch": 0.04,
30459
+ "learning_rate": 0.0004,
30460
+ "loss": 3.2743,
30461
+ "step": 4999
30462
+ },
30463
+ {
30464
+ "epoch": 0.04,
30465
+ "learning_rate": 0.0004,
30466
+ "loss": 4.5054,
30467
+ "step": 5000
30468
+ },
30469
+ {
30470
+ "epoch": 0.04,
30471
+ "eval_loss": 6.463876247406006,
30472
+ "eval_runtime": 22.4171,
30473
+ "eval_samples_per_second": 2.23,
30474
+ "eval_steps_per_second": 1.115,
30475
+ "step": 5000
30476
+ },
30477
+ {
30478
+ "epoch": 0.04,
30479
+ "mmlu_eval_accuracy": 0.2525477994227994,
30480
+ "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
30481
+ "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
30482
+ "mmlu_eval_accuracy_astronomy": 0.3125,
30483
+ "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
30484
+ "mmlu_loss": 3.4964506435394287,
30485
+ "step": 5000
30486
+ },
30487
+ {
30488
+ "epoch": 0.04,
30489
+ "learning_rate": 0.0004,
30490
+ "loss": 7.8989,
30491
+ "step": 5001
30492
+ },
30493
+ {
30494
+ "epoch": 0.04,
30495
+ "learning_rate": 0.0004,
30496
+ "loss": 8.045,
30497
+ "step": 5002
30498
+ },
30499
+ {
30500
+ "epoch": 0.04,
30501
+ "learning_rate": 0.0004,
30502
+ "loss": 8.2012,
30503
+ "step": 5003
30504
+ },
30505
+ {
30506
+ "epoch": 0.04,
30507
+ "learning_rate": 0.0004,
30508
+ "loss": 7.5305,
30509
+ "step": 5004
30510
+ },
30511
+ {
30512
+ "epoch": 0.04,
30513
+ "learning_rate": 0.0004,
30514
+ "loss": 7.2522,
30515
+ "step": 5005
30516
+ },
30517
+ {
30518
+ "epoch": 0.04,
30519
+ "learning_rate": 0.0004,
30520
+ "loss": 7.5176,
30521
+ "step": 5006
30522
+ },
30523
+ {
30524
+ "epoch": 0.04,
30525
+ "learning_rate": 0.0004,
30526
+ "loss": 6.9333,
30527
+ "step": 5007
30528
+ },
30529
+ {
30530
+ "epoch": 0.04,
30531
+ "learning_rate": 0.0004,
30532
+ "loss": 5.6115,
30533
+ "step": 5008
30534
+ },
30535
+ {
30536
+ "epoch": 0.04,
30537
+ "learning_rate": 0.0004,
30538
+ "loss": 7.2527,
30539
+ "step": 5009
30540
+ },
30541
+ {
30542
+ "epoch": 0.04,
30543
+ "learning_rate": 0.0004,
30544
+ "loss": 7.3046,
30545
+ "step": 5010
30546
+ },
30547
+ {
30548
+ "epoch": 0.04,
30549
+ "learning_rate": 0.0004,
30550
+ "loss": 6.2746,
30551
+ "step": 5011
30552
+ },
30553
+ {
30554
+ "epoch": 0.04,
30555
+ "learning_rate": 0.0004,
30556
+ "loss": 5.8078,
30557
+ "step": 5012
30558
+ },
30559
+ {
30560
+ "epoch": 0.04,
30561
+ "learning_rate": 0.0004,
30562
+ "loss": 5.0053,
30563
+ "step": 5013
30564
+ },
30565
+ {
30566
+ "epoch": 0.04,
30567
+ "learning_rate": 0.0004,
30568
+ "loss": 8.3415,
30569
+ "step": 5014
30570
+ },
30571
+ {
30572
+ "epoch": 0.04,
30573
+ "learning_rate": 0.0004,
30574
+ "loss": 8.1199,
30575
+ "step": 5015
30576
+ },
30577
+ {
30578
+ "epoch": 0.04,
30579
+ "learning_rate": 0.0004,
30580
+ "loss": 5.6292,
30581
+ "step": 5016
30582
+ },
30583
+ {
30584
+ "epoch": 0.04,
30585
+ "learning_rate": 0.0004,
30586
+ "loss": 3.5195,
30587
+ "step": 5017
30588
+ },
30589
+ {
30590
+ "epoch": 0.04,
30591
+ "learning_rate": 0.0004,
30592
+ "loss": 7.2367,
30593
+ "step": 5018
30594
+ },
30595
+ {
30596
+ "epoch": 0.04,
30597
+ "learning_rate": 0.0004,
30598
+ "loss": 4.219,
30599
+ "step": 5019
30600
+ },
30601
+ {
30602
+ "epoch": 0.04,
30603
+ "learning_rate": 0.0004,
30604
+ "loss": 3.4376,
30605
+ "step": 5020
30606
+ },
30607
+ {
30608
+ "epoch": 0.04,
30609
+ "learning_rate": 0.0004,
30610
+ "loss": 4.1413,
30611
+ "step": 5021
30612
+ },
30613
+ {
30614
+ "epoch": 0.04,
30615
+ "learning_rate": 0.0004,
30616
+ "loss": 5.7102,
30617
+ "step": 5022
30618
+ },
30619
+ {
30620
+ "epoch": 0.04,
30621
+ "learning_rate": 0.0004,
30622
+ "loss": 3.3297,
30623
+ "step": 5023
30624
+ },
30625
+ {
30626
+ "epoch": 0.04,
30627
+ "learning_rate": 0.0004,
30628
+ "loss": 8.9923,
30629
+ "step": 5024
30630
+ },
30631
+ {
30632
+ "epoch": 0.04,
30633
+ "learning_rate": 0.0004,
30634
+ "loss": 3.0071,
30635
+ "step": 5025
30636
+ },
30637
+ {
30638
+ "epoch": 0.04,
30639
+ "learning_rate": 0.0004,
30640
+ "loss": 7.351,
30641
+ "step": 5026
30642
+ },
30643
+ {
30644
+ "epoch": 0.04,
30645
+ "learning_rate": 0.0004,
30646
+ "loss": 2.824,
30647
+ "step": 5027
30648
+ },
30649
+ {
30650
+ "epoch": 0.04,
30651
+ "learning_rate": 0.0004,
30652
+ "loss": 7.5031,
30653
+ "step": 5028
30654
+ },
30655
+ {
30656
+ "epoch": 0.04,
30657
+ "learning_rate": 0.0004,
30658
+ "loss": 7.7362,
30659
+ "step": 5029
30660
+ },
30661
+ {
30662
+ "epoch": 0.04,
30663
+ "learning_rate": 0.0004,
30664
+ "loss": 5.5291,
30665
+ "step": 5030
30666
+ },
30667
+ {
30668
+ "epoch": 0.04,
30669
+ "learning_rate": 0.0004,
30670
+ "loss": 6.2754,
30671
+ "step": 5031
30672
+ },
30673
+ {
30674
+ "epoch": 0.04,
30675
+ "learning_rate": 0.0004,
30676
+ "loss": 3.1831,
30677
+ "step": 5032
30678
+ },
30679
+ {
30680
+ "epoch": 0.04,
30681
+ "learning_rate": 0.0004,
30682
+ "loss": 7.1838,
30683
+ "step": 5033
30684
+ },
30685
+ {
30686
+ "epoch": 0.04,
30687
+ "learning_rate": 0.0004,
30688
+ "loss": 5.2428,
30689
+ "step": 5034
30690
+ },
30691
+ {
30692
+ "epoch": 0.04,
30693
+ "learning_rate": 0.0004,
30694
+ "loss": 3.0482,
30695
+ "step": 5035
30696
+ },
30697
+ {
30698
+ "epoch": 0.04,
30699
+ "learning_rate": 0.0004,
30700
+ "loss": 5.2135,
30701
+ "step": 5036
30702
+ },
30703
+ {
30704
+ "epoch": 0.04,
30705
+ "learning_rate": 0.0004,
30706
+ "loss": 6.2257,
30707
+ "step": 5037
30708
+ },
30709
+ {
30710
+ "epoch": 0.04,
30711
+ "learning_rate": 0.0004,
30712
+ "loss": 6.7514,
30713
+ "step": 5038
30714
+ },
30715
+ {
30716
+ "epoch": 0.04,
30717
+ "learning_rate": 0.0004,
30718
+ "loss": 5.9855,
30719
+ "step": 5039
30720
+ },
30721
+ {
30722
+ "epoch": 0.04,
30723
+ "learning_rate": 0.0004,
30724
+ "loss": 2.9738,
30725
+ "step": 5040
30726
+ },
30727
+ {
30728
+ "epoch": 0.04,
30729
+ "learning_rate": 0.0004,
30730
+ "loss": 3.3993,
30731
+ "step": 5041
30732
+ },
30733
+ {
30734
+ "epoch": 0.04,
30735
+ "learning_rate": 0.0004,
30736
+ "loss": 8.35,
30737
+ "step": 5042
30738
+ },
30739
+ {
30740
+ "epoch": 0.04,
30741
+ "learning_rate": 0.0004,
30742
+ "loss": 7.1362,
30743
+ "step": 5043
30744
+ },
30745
+ {
30746
+ "epoch": 0.04,
30747
+ "learning_rate": 0.0004,
30748
+ "loss": 3.2576,
30749
+ "step": 5044
30750
+ },
30751
+ {
30752
+ "epoch": 0.04,
30753
+ "learning_rate": 0.0004,
30754
+ "loss": 4.0811,
30755
+ "step": 5045
30756
+ },
30757
+ {
30758
+ "epoch": 0.04,
30759
+ "learning_rate": 0.0004,
30760
+ "loss": 5.1524,
30761
+ "step": 5046
30762
+ },
30763
+ {
30764
+ "epoch": 0.04,
30765
+ "learning_rate": 0.0004,
30766
+ "loss": 3.3487,
30767
+ "step": 5047
30768
+ },
30769
+ {
30770
+ "epoch": 0.04,
30771
+ "learning_rate": 0.0004,
30772
+ "loss": 3.6397,
30773
+ "step": 5048
30774
+ },
30775
+ {
30776
+ "epoch": 0.04,
30777
+ "learning_rate": 0.0004,
30778
+ "loss": 3.5974,
30779
+ "step": 5049
30780
+ },
30781
+ {
30782
+ "epoch": 0.04,
30783
+ "learning_rate": 0.0004,
30784
+ "loss": 3.3061,
30785
+ "step": 5050
30786
+ },
30787
+ {
30788
+ "epoch": 0.04,
30789
+ "learning_rate": 0.0004,
30790
+ "loss": 8.1425,
30791
+ "step": 5051
30792
+ },
30793
+ {
30794
+ "epoch": 0.04,
30795
+ "learning_rate": 0.0004,
30796
+ "loss": 7.7089,
30797
+ "step": 5052
30798
+ },
30799
+ {
30800
+ "epoch": 0.04,
30801
+ "learning_rate": 0.0004,
30802
+ "loss": 7.2659,
30803
+ "step": 5053
30804
+ },
30805
+ {
30806
+ "epoch": 0.04,
30807
+ "learning_rate": 0.0004,
30808
+ "loss": 8.8699,
30809
+ "step": 5054
30810
+ },
30811
+ {
30812
+ "epoch": 0.04,
30813
+ "learning_rate": 0.0004,
30814
+ "loss": 6.561,
30815
+ "step": 5055
30816
+ },
30817
+ {
30818
+ "epoch": 0.04,
30819
+ "learning_rate": 0.0004,
30820
+ "loss": 8.8095,
30821
+ "step": 5056
30822
+ },
30823
+ {
30824
+ "epoch": 0.04,
30825
+ "learning_rate": 0.0004,
30826
+ "loss": 8.5513,
30827
+ "step": 5057
30828
+ },
30829
+ {
30830
+ "epoch": 0.04,
30831
+ "learning_rate": 0.0004,
30832
+ "loss": 5.8696,
30833
+ "step": 5058
30834
+ },
30835
+ {
30836
+ "epoch": 0.04,
30837
+ "learning_rate": 0.0004,
30838
+ "loss": 7.397,
30839
+ "step": 5059
30840
+ },
30841
+ {
30842
+ "epoch": 0.04,
30843
+ "learning_rate": 0.0004,
30844
+ "loss": 7.3762,
30845
+ "step": 5060
30846
+ },
30847
+ {
30848
+ "epoch": 0.04,
30849
+ "learning_rate": 0.0004,
30850
+ "loss": 7.1008,
30851
+ "step": 5061
30852
+ },
30853
+ {
30854
+ "epoch": 0.04,
30855
+ "learning_rate": 0.0004,
30856
+ "loss": 6.4717,
30857
+ "step": 5062
30858
+ },
30859
+ {
30860
+ "epoch": 0.04,
30861
+ "learning_rate": 0.0004,
30862
+ "loss": 6.8093,
30863
+ "step": 5063
30864
+ },
30865
+ {
30866
+ "epoch": 0.04,
30867
+ "learning_rate": 0.0004,
30868
+ "loss": 3.9448,
30869
+ "step": 5064
30870
+ },
30871
+ {
30872
+ "epoch": 0.04,
30873
+ "learning_rate": 0.0004,
30874
+ "loss": 3.4344,
30875
+ "step": 5065
30876
+ },
30877
+ {
30878
+ "epoch": 0.04,
30879
+ "learning_rate": 0.0004,
30880
+ "loss": 7.6244,
30881
+ "step": 5066
30882
+ },
30883
+ {
30884
+ "epoch": 0.04,
30885
+ "learning_rate": 0.0004,
30886
+ "loss": 3.1716,
30887
+ "step": 5067
30888
+ },
30889
+ {
30890
+ "epoch": 0.04,
30891
+ "learning_rate": 0.0004,
30892
+ "loss": 3.664,
30893
+ "step": 5068
30894
+ },
30895
+ {
30896
+ "epoch": 0.04,
30897
+ "learning_rate": 0.0004,
30898
+ "loss": 6.798,
30899
+ "step": 5069
30900
+ },
30901
+ {
30902
+ "epoch": 0.04,
30903
+ "learning_rate": 0.0004,
30904
+ "loss": 7.5492,
30905
+ "step": 5070
30906
+ },
30907
+ {
30908
+ "epoch": 0.04,
30909
+ "learning_rate": 0.0004,
30910
+ "loss": 6.4213,
30911
+ "step": 5071
30912
+ },
30913
+ {
30914
+ "epoch": 0.04,
30915
+ "learning_rate": 0.0004,
30916
+ "loss": 7.4969,
30917
+ "step": 5072
30918
+ },
30919
+ {
30920
+ "epoch": 0.04,
30921
+ "learning_rate": 0.0004,
30922
+ "loss": 4.6795,
30923
+ "step": 5073
30924
+ },
30925
+ {
30926
+ "epoch": 0.04,
30927
+ "learning_rate": 0.0004,
30928
+ "loss": 3.4509,
30929
+ "step": 5074
30930
+ },
30931
+ {
30932
+ "epoch": 0.04,
30933
+ "learning_rate": 0.0004,
30934
+ "loss": 3.7005,
30935
+ "step": 5075
30936
+ },
30937
+ {
30938
+ "epoch": 0.04,
30939
+ "learning_rate": 0.0004,
30940
+ "loss": 3.0754,
30941
+ "step": 5076
30942
+ },
30943
+ {
30944
+ "epoch": 0.04,
30945
+ "learning_rate": 0.0004,
30946
+ "loss": 3.0044,
30947
+ "step": 5077
30948
+ },
30949
+ {
30950
+ "epoch": 0.04,
30951
+ "learning_rate": 0.0004,
30952
+ "loss": 5.8357,
30953
+ "step": 5078
30954
+ },
30955
+ {
30956
+ "epoch": 0.04,
30957
+ "learning_rate": 0.0004,
30958
+ "loss": 6.0447,
30959
+ "step": 5079
30960
+ },
30961
+ {
30962
+ "epoch": 0.04,
30963
+ "learning_rate": 0.0004,
30964
+ "loss": 3.0046,
30965
+ "step": 5080
30966
+ },
30967
+ {
30968
+ "epoch": 0.04,
30969
+ "learning_rate": 0.0004,
30970
+ "loss": 7.3184,
30971
+ "step": 5081
30972
+ },
30973
+ {
30974
+ "epoch": 0.04,
30975
+ "learning_rate": 0.0004,
30976
+ "loss": 9.1669,
30977
+ "step": 5082
30978
+ },
30979
+ {
30980
+ "epoch": 0.04,
30981
+ "learning_rate": 0.0004,
30982
+ "loss": 2.7666,
30983
+ "step": 5083
30984
+ },
30985
+ {
30986
+ "epoch": 0.04,
30987
+ "learning_rate": 0.0004,
30988
+ "loss": 3.6633,
30989
+ "step": 5084
30990
+ },
30991
+ {
30992
+ "epoch": 0.04,
30993
+ "learning_rate": 0.0004,
30994
+ "loss": 2.5884,
30995
+ "step": 5085
30996
+ },
30997
+ {
30998
+ "epoch": 0.04,
30999
+ "learning_rate": 0.0004,
31000
+ "loss": 7.572,
31001
+ "step": 5086
31002
+ },
31003
+ {
31004
+ "epoch": 0.04,
31005
+ "learning_rate": 0.0004,
31006
+ "loss": 3.7835,
31007
+ "step": 5087
31008
+ },
31009
+ {
31010
+ "epoch": 0.04,
31011
+ "learning_rate": 0.0004,
31012
+ "loss": 9.58,
31013
+ "step": 5088
31014
+ },
31015
+ {
31016
+ "epoch": 0.04,
31017
+ "learning_rate": 0.0004,
31018
+ "loss": 1.8841,
31019
+ "step": 5089
31020
+ },
31021
+ {
31022
+ "epoch": 0.04,
31023
+ "learning_rate": 0.0004,
31024
+ "loss": 1.8848,
31025
+ "step": 5090
31026
+ },
31027
+ {
31028
+ "epoch": 0.04,
31029
+ "learning_rate": 0.0004,
31030
+ "loss": 2.2856,
31031
+ "step": 5091
31032
+ },
31033
+ {
31034
+ "epoch": 0.04,
31035
+ "learning_rate": 0.0004,
31036
+ "loss": 5.4633,
31037
+ "step": 5092
31038
+ },
31039
+ {
31040
+ "epoch": 0.04,
31041
+ "learning_rate": 0.0004,
31042
+ "loss": 6.082,
31043
+ "step": 5093
31044
+ },
31045
+ {
31046
+ "epoch": 0.04,
31047
+ "learning_rate": 0.0004,
31048
+ "loss": 6.8496,
31049
+ "step": 5094
31050
+ },
31051
+ {
31052
+ "epoch": 0.04,
31053
+ "learning_rate": 0.0004,
31054
+ "loss": 6.2212,
31055
+ "step": 5095
31056
+ },
31057
+ {
31058
+ "epoch": 0.04,
31059
+ "learning_rate": 0.0004,
31060
+ "loss": 6.5972,
31061
+ "step": 5096
31062
+ },
31063
+ {
31064
+ "epoch": 0.04,
31065
+ "learning_rate": 0.0004,
31066
+ "loss": 6.9355,
31067
+ "step": 5097
31068
+ },
31069
+ {
31070
+ "epoch": 0.04,
31071
+ "learning_rate": 0.0004,
31072
+ "loss": 7.5645,
31073
+ "step": 5098
31074
+ },
31075
+ {
31076
+ "epoch": 0.04,
31077
+ "learning_rate": 0.0004,
31078
+ "loss": 5.944,
31079
+ "step": 5099
31080
+ },
31081
+ {
31082
+ "epoch": 0.04,
31083
+ "learning_rate": 0.0004,
31084
+ "loss": 1.7678,
31085
+ "step": 5100
31086
+ },
31087
+ {
31088
+ "epoch": 0.04,
31089
+ "learning_rate": 0.0004,
31090
+ "loss": 2.294,
31091
+ "step": 5101
31092
+ },
31093
+ {
31094
+ "epoch": 0.04,
31095
+ "learning_rate": 0.0004,
31096
+ "loss": 8.9452,
31097
+ "step": 5102
31098
+ },
31099
+ {
31100
+ "epoch": 0.04,
31101
+ "learning_rate": 0.0004,
31102
+ "loss": 7.2751,
31103
+ "step": 5103
31104
+ },
31105
+ {
31106
+ "epoch": 0.04,
31107
+ "learning_rate": 0.0004,
31108
+ "loss": 7.2882,
31109
+ "step": 5104
31110
+ },
31111
+ {
31112
+ "epoch": 0.04,
31113
+ "learning_rate": 0.0004,
31114
+ "loss": 8.3101,
31115
+ "step": 5105
31116
+ },
31117
+ {
31118
+ "epoch": 0.04,
31119
+ "learning_rate": 0.0004,
31120
+ "loss": 8.1729,
31121
+ "step": 5106
31122
+ },
31123
+ {
31124
+ "epoch": 0.04,
31125
+ "learning_rate": 0.0004,
31126
+ "loss": 4.7164,
31127
+ "step": 5107
31128
+ },
31129
+ {
31130
+ "epoch": 0.04,
31131
+ "learning_rate": 0.0004,
31132
+ "loss": 6.9774,
31133
+ "step": 5108
31134
+ },
31135
+ {
31136
+ "epoch": 0.04,
31137
+ "learning_rate": 0.0004,
31138
+ "loss": 8.5206,
31139
+ "step": 5109
31140
+ },
31141
+ {
31142
+ "epoch": 0.04,
31143
+ "learning_rate": 0.0004,
31144
+ "loss": 7.961,
31145
+ "step": 5110
31146
+ },
31147
+ {
31148
+ "epoch": 0.04,
31149
+ "learning_rate": 0.0004,
31150
+ "loss": 2.5247,
31151
+ "step": 5111
31152
+ },
31153
+ {
31154
+ "epoch": 0.04,
31155
+ "learning_rate": 0.0004,
31156
+ "loss": 6.9292,
31157
+ "step": 5112
31158
+ },
31159
+ {
31160
+ "epoch": 0.04,
31161
+ "learning_rate": 0.0004,
31162
+ "loss": 7.3455,
31163
+ "step": 5113
31164
+ },
31165
+ {
31166
+ "epoch": 0.04,
31167
+ "learning_rate": 0.0004,
31168
+ "loss": 8.483,
31169
+ "step": 5114
31170
+ },
31171
+ {
31172
+ "epoch": 0.04,
31173
+ "learning_rate": 0.0004,
31174
+ "loss": 6.5356,
31175
+ "step": 5115
31176
+ },
31177
+ {
31178
+ "epoch": 0.04,
31179
+ "learning_rate": 0.0004,
31180
+ "loss": 8.7652,
31181
+ "step": 5116
31182
+ },
31183
+ {
31184
+ "epoch": 0.04,
31185
+ "learning_rate": 0.0004,
31186
+ "loss": 7.6761,
31187
+ "step": 5117
31188
+ },
31189
+ {
31190
+ "epoch": 0.04,
31191
+ "learning_rate": 0.0004,
31192
+ "loss": 3.3517,
31193
+ "step": 5118
31194
+ },
31195
+ {
31196
+ "epoch": 0.04,
31197
+ "learning_rate": 0.0004,
31198
+ "loss": 3.6645,
31199
+ "step": 5119
31200
+ },
31201
+ {
31202
+ "epoch": 0.04,
31203
+ "learning_rate": 0.0004,
31204
+ "loss": 8.1988,
31205
+ "step": 5120
31206
+ },
31207
+ {
31208
+ "epoch": 0.04,
31209
+ "learning_rate": 0.0004,
31210
+ "loss": 7.292,
31211
+ "step": 5121
31212
+ },
31213
+ {
31214
+ "epoch": 0.04,
31215
+ "learning_rate": 0.0004,
31216
+ "loss": 7.7002,
31217
+ "step": 5122
31218
+ },
31219
+ {
31220
+ "epoch": 0.04,
31221
+ "learning_rate": 0.0004,
31222
+ "loss": 5.1236,
31223
+ "step": 5123
31224
+ },
31225
+ {
31226
+ "epoch": 0.04,
31227
+ "learning_rate": 0.0004,
31228
+ "loss": 4.2389,
31229
+ "step": 5124
31230
+ },
31231
+ {
31232
+ "epoch": 0.04,
31233
+ "learning_rate": 0.0004,
31234
+ "loss": 5.5483,
31235
+ "step": 5125
31236
+ },
31237
+ {
31238
+ "epoch": 0.04,
31239
+ "learning_rate": 0.0004,
31240
+ "loss": 6.6833,
31241
+ "step": 5126
31242
+ },
31243
+ {
31244
+ "epoch": 0.04,
31245
+ "learning_rate": 0.0004,
31246
+ "loss": 7.1315,
31247
+ "step": 5127
31248
+ },
31249
+ {
31250
+ "epoch": 0.04,
31251
+ "learning_rate": 0.0004,
31252
+ "loss": 3.8323,
31253
+ "step": 5128
31254
+ },
31255
+ {
31256
+ "epoch": 0.04,
31257
+ "learning_rate": 0.0004,
31258
+ "loss": 2.7692,
31259
+ "step": 5129
31260
+ },
31261
+ {
31262
+ "epoch": 0.04,
31263
+ "learning_rate": 0.0004,
31264
+ "loss": 6.637,
31265
+ "step": 5130
31266
+ },
31267
+ {
31268
+ "epoch": 0.04,
31269
+ "learning_rate": 0.0004,
31270
+ "loss": 4.0574,
31271
+ "step": 5131
31272
+ },
31273
+ {
31274
+ "epoch": 0.04,
31275
+ "learning_rate": 0.0004,
31276
+ "loss": 3.1885,
31277
+ "step": 5132
31278
+ },
31279
+ {
31280
+ "epoch": 0.04,
31281
+ "learning_rate": 0.0004,
31282
+ "loss": 3.2203,
31283
+ "step": 5133
31284
+ },
31285
+ {
31286
+ "epoch": 0.04,
31287
+ "learning_rate": 0.0004,
31288
+ "loss": 3.0885,
31289
+ "step": 5134
31290
+ },
31291
+ {
31292
+ "epoch": 0.04,
31293
+ "learning_rate": 0.0004,
31294
+ "loss": 8.2705,
31295
+ "step": 5135
31296
+ },
31297
+ {
31298
+ "epoch": 0.04,
31299
+ "learning_rate": 0.0004,
31300
+ "loss": 7.3963,
31301
+ "step": 5136
31302
+ },
31303
+ {
31304
+ "epoch": 0.04,
31305
+ "learning_rate": 0.0004,
31306
+ "loss": 6.9949,
31307
+ "step": 5137
31308
+ },
31309
+ {
31310
+ "epoch": 0.04,
31311
+ "learning_rate": 0.0004,
31312
+ "loss": 5.7339,
31313
+ "step": 5138
31314
+ },
31315
+ {
31316
+ "epoch": 0.04,
31317
+ "learning_rate": 0.0004,
31318
+ "loss": 5.041,
31319
+ "step": 5139
31320
+ },
31321
+ {
31322
+ "epoch": 0.04,
31323
+ "learning_rate": 0.0004,
31324
+ "loss": 7.3292,
31325
+ "step": 5140
31326
+ },
31327
+ {
31328
+ "epoch": 0.04,
31329
+ "learning_rate": 0.0004,
31330
+ "loss": 2.7841,
31331
+ "step": 5141
31332
+ },
31333
+ {
31334
+ "epoch": 0.04,
31335
+ "learning_rate": 0.0004,
31336
+ "loss": 2.5847,
31337
+ "step": 5142
31338
+ },
31339
+ {
31340
+ "epoch": 0.04,
31341
+ "learning_rate": 0.0004,
31342
+ "loss": 3.3698,
31343
+ "step": 5143
31344
+ },
31345
+ {
31346
+ "epoch": 0.04,
31347
+ "learning_rate": 0.0004,
31348
+ "loss": 6.2052,
31349
+ "step": 5144
31350
+ },
31351
+ {
31352
+ "epoch": 0.04,
31353
+ "learning_rate": 0.0004,
31354
+ "loss": 4.8951,
31355
+ "step": 5145
31356
+ },
31357
+ {
31358
+ "epoch": 0.04,
31359
+ "learning_rate": 0.0004,
31360
+ "loss": 5.9602,
31361
+ "step": 5146
31362
+ },
31363
+ {
31364
+ "epoch": 0.04,
31365
+ "learning_rate": 0.0004,
31366
+ "loss": 2.2329,
31367
+ "step": 5147
31368
+ },
31369
+ {
31370
+ "epoch": 0.04,
31371
+ "learning_rate": 0.0004,
31372
+ "loss": 8.0463,
31373
+ "step": 5148
31374
+ },
31375
+ {
31376
+ "epoch": 0.04,
31377
+ "learning_rate": 0.0004,
31378
+ "loss": 4.4032,
31379
+ "step": 5149
31380
+ },
31381
+ {
31382
+ "epoch": 0.04,
31383
+ "learning_rate": 0.0004,
31384
+ "loss": 3.3965,
31385
+ "step": 5150
31386
+ },
31387
+ {
31388
+ "epoch": 0.04,
31389
+ "learning_rate": 0.0004,
31390
+ "loss": 8.4988,
31391
+ "step": 5151
31392
+ },
31393
+ {
31394
+ "epoch": 0.04,
31395
+ "learning_rate": 0.0004,
31396
+ "loss": 8.2164,
31397
+ "step": 5152
31398
+ },
31399
+ {
31400
+ "epoch": 0.04,
31401
+ "learning_rate": 0.0004,
31402
+ "loss": 7.1181,
31403
+ "step": 5153
31404
+ },
31405
+ {
31406
+ "epoch": 0.04,
31407
+ "learning_rate": 0.0004,
31408
+ "loss": 7.6035,
31409
+ "step": 5154
31410
+ },
31411
+ {
31412
+ "epoch": 0.04,
31413
+ "learning_rate": 0.0004,
31414
+ "loss": 4.8853,
31415
+ "step": 5155
31416
+ },
31417
+ {
31418
+ "epoch": 0.04,
31419
+ "learning_rate": 0.0004,
31420
+ "loss": 8.9761,
31421
+ "step": 5156
31422
+ },
31423
+ {
31424
+ "epoch": 0.04,
31425
+ "learning_rate": 0.0004,
31426
+ "loss": 7.6907,
31427
+ "step": 5157
31428
+ },
31429
+ {
31430
+ "epoch": 0.04,
31431
+ "learning_rate": 0.0004,
31432
+ "loss": 4.5017,
31433
+ "step": 5158
31434
+ },
31435
+ {
31436
+ "epoch": 0.04,
31437
+ "learning_rate": 0.0004,
31438
+ "loss": 4.3164,
31439
+ "step": 5159
31440
+ },
31441
+ {
31442
+ "epoch": 0.04,
31443
+ "learning_rate": 0.0004,
31444
+ "loss": 6.7913,
31445
+ "step": 5160
31446
+ },
31447
+ {
31448
+ "epoch": 0.04,
31449
+ "learning_rate": 0.0004,
31450
+ "loss": 8.4106,
31451
+ "step": 5161
31452
+ },
31453
+ {
31454
+ "epoch": 0.04,
31455
+ "learning_rate": 0.0004,
31456
+ "loss": 6.4332,
31457
+ "step": 5162
31458
+ },
31459
+ {
31460
+ "epoch": 0.04,
31461
+ "learning_rate": 0.0004,
31462
+ "loss": 7.8098,
31463
+ "step": 5163
31464
+ },
31465
+ {
31466
+ "epoch": 0.04,
31467
+ "learning_rate": 0.0004,
31468
+ "loss": 6.2833,
31469
+ "step": 5164
31470
+ },
31471
+ {
31472
+ "epoch": 0.04,
31473
+ "learning_rate": 0.0004,
31474
+ "loss": 7.7168,
31475
+ "step": 5165
31476
+ },
31477
+ {
31478
+ "epoch": 0.04,
31479
+ "learning_rate": 0.0004,
31480
+ "loss": 8.851,
31481
+ "step": 5166
31482
+ },
31483
+ {
31484
+ "epoch": 0.04,
31485
+ "learning_rate": 0.0004,
31486
+ "loss": 4.8072,
31487
+ "step": 5167
31488
+ },
31489
+ {
31490
+ "epoch": 0.04,
31491
+ "learning_rate": 0.0004,
31492
+ "loss": 6.9745,
31493
+ "step": 5168
31494
+ },
31495
+ {
31496
+ "epoch": 0.04,
31497
+ "learning_rate": 0.0004,
31498
+ "loss": 6.5735,
31499
+ "step": 5169
31500
+ },
31501
+ {
31502
+ "epoch": 0.04,
31503
+ "learning_rate": 0.0004,
31504
+ "loss": 7.771,
31505
+ "step": 5170
31506
+ },
31507
+ {
31508
+ "epoch": 0.04,
31509
+ "learning_rate": 0.0004,
31510
+ "loss": 6.9335,
31511
+ "step": 5171
31512
+ },
31513
+ {
31514
+ "epoch": 0.04,
31515
+ "learning_rate": 0.0004,
31516
+ "loss": 6.856,
31517
+ "step": 5172
31518
+ },
31519
+ {
31520
+ "epoch": 0.04,
31521
+ "learning_rate": 0.0004,
31522
+ "loss": 6.4643,
31523
+ "step": 5173
31524
+ },
31525
+ {
31526
+ "epoch": 0.04,
31527
+ "learning_rate": 0.0004,
31528
+ "loss": 7.2565,
31529
+ "step": 5174
31530
+ },
31531
+ {
31532
+ "epoch": 0.04,
31533
+ "learning_rate": 0.0004,
31534
+ "loss": 8.0499,
31535
+ "step": 5175
31536
+ },
31537
+ {
31538
+ "epoch": 0.04,
31539
+ "learning_rate": 0.0004,
31540
+ "loss": 4.8558,
31541
+ "step": 5176
31542
+ },
31543
+ {
31544
+ "epoch": 0.04,
31545
+ "learning_rate": 0.0004,
31546
+ "loss": 3.99,
31547
+ "step": 5177
31548
+ },
31549
+ {
31550
+ "epoch": 0.04,
31551
+ "learning_rate": 0.0004,
31552
+ "loss": 3.7183,
31553
+ "step": 5178
31554
+ },
31555
+ {
31556
+ "epoch": 0.04,
31557
+ "learning_rate": 0.0004,
31558
+ "loss": 5.2353,
31559
+ "step": 5179
31560
+ },
31561
+ {
31562
+ "epoch": 0.04,
31563
+ "learning_rate": 0.0004,
31564
+ "loss": 8.1037,
31565
+ "step": 5180
31566
+ },
31567
+ {
31568
+ "epoch": 0.04,
31569
+ "learning_rate": 0.0004,
31570
+ "loss": 4.4882,
31571
+ "step": 5181
31572
+ },
31573
+ {
31574
+ "epoch": 0.04,
31575
+ "learning_rate": 0.0004,
31576
+ "loss": 6.2169,
31577
+ "step": 5182
31578
+ },
31579
+ {
31580
+ "epoch": 0.04,
31581
+ "learning_rate": 0.0004,
31582
+ "loss": 7.01,
31583
+ "step": 5183
31584
+ },
31585
+ {
31586
+ "epoch": 0.04,
31587
+ "learning_rate": 0.0004,
31588
+ "loss": 8.2869,
31589
+ "step": 5184
31590
+ },
31591
+ {
31592
+ "epoch": 0.04,
31593
+ "learning_rate": 0.0004,
31594
+ "loss": 3.5804,
31595
+ "step": 5185
31596
+ },
31597
+ {
31598
+ "epoch": 0.04,
31599
+ "learning_rate": 0.0004,
31600
+ "loss": 5.3033,
31601
+ "step": 5186
31602
+ },
31603
+ {
31604
+ "epoch": 0.04,
31605
+ "learning_rate": 0.0004,
31606
+ "loss": 4.1612,
31607
+ "step": 5187
31608
+ },
31609
+ {
31610
+ "epoch": 0.04,
31611
+ "learning_rate": 0.0004,
31612
+ "loss": 6.9619,
31613
+ "step": 5188
31614
+ },
31615
+ {
31616
+ "epoch": 0.04,
31617
+ "learning_rate": 0.0004,
31618
+ "loss": 5.3567,
31619
+ "step": 5189
31620
+ },
31621
+ {
31622
+ "epoch": 0.04,
31623
+ "learning_rate": 0.0004,
31624
+ "loss": 3.2493,
31625
+ "step": 5190
31626
+ },
31627
+ {
31628
+ "epoch": 0.04,
31629
+ "learning_rate": 0.0004,
31630
+ "loss": 7.3546,
31631
+ "step": 5191
31632
+ },
31633
+ {
31634
+ "epoch": 0.04,
31635
+ "learning_rate": 0.0004,
31636
+ "loss": 6.3477,
31637
+ "step": 5192
31638
+ },
31639
+ {
31640
+ "epoch": 0.04,
31641
+ "learning_rate": 0.0004,
31642
+ "loss": 6.7189,
31643
+ "step": 5193
31644
+ },
31645
+ {
31646
+ "epoch": 0.04,
31647
+ "learning_rate": 0.0004,
31648
+ "loss": 3.956,
31649
+ "step": 5194
31650
+ },
31651
+ {
31652
+ "epoch": 0.04,
31653
+ "learning_rate": 0.0004,
31654
+ "loss": 5.3166,
31655
+ "step": 5195
31656
+ },
31657
+ {
31658
+ "epoch": 0.04,
31659
+ "learning_rate": 0.0004,
31660
+ "loss": 6.0115,
31661
+ "step": 5196
31662
+ },
31663
+ {
31664
+ "epoch": 0.04,
31665
+ "learning_rate": 0.0004,
31666
+ "loss": 3.3418,
31667
+ "step": 5197
31668
+ },
31669
+ {
31670
+ "epoch": 0.04,
31671
+ "learning_rate": 0.0004,
31672
+ "loss": 3.1107,
31673
+ "step": 5198
31674
+ },
31675
+ {
31676
+ "epoch": 0.04,
31677
+ "learning_rate": 0.0004,
31678
+ "loss": 6.1123,
31679
+ "step": 5199
31680
+ },
31681
+ {
31682
+ "epoch": 0.04,
31683
+ "learning_rate": 0.0004,
31684
+ "loss": 3.6152,
31685
+ "step": 5200
31686
+ },
31687
+ {
31688
+ "epoch": 0.04,
31689
+ "eval_loss": 6.378727912902832,
31690
+ "eval_runtime": 22.4659,
31691
+ "eval_samples_per_second": 2.226,
31692
+ "eval_steps_per_second": 1.113,
31693
+ "step": 5200
31694
+ },
31695
+ {
31696
+ "epoch": 0.04,
31697
+ "mmlu_eval_accuracy": 0.32602813852813856,
31698
+ "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
31699
+ "mmlu_eval_accuracy_anatomy": 0.35714285714285715,
31700
+ "mmlu_eval_accuracy_astronomy": 0.25,
31701
+ "mmlu_eval_accuracy_business_ethics": 0.3333333333333333,
31702
+ "mmlu_loss": 3.6307257652282714,
31703
+ "step": 5200
31704
+ },
31705
+ {
31706
+ "epoch": 0.04,
31707
+ "learning_rate": 0.0004,
31708
+ "loss": 8.3892,
31709
+ "step": 5201
31710
+ },
31711
+ {
31712
+ "epoch": 0.04,
31713
+ "learning_rate": 0.0004,
31714
+ "loss": 8.2349,
31715
+ "step": 5202
31716
+ },
31717
+ {
31718
+ "epoch": 0.04,
31719
+ "learning_rate": 0.0004,
31720
+ "loss": 7.5333,
31721
+ "step": 5203
31722
+ },
31723
+ {
31724
+ "epoch": 0.04,
31725
+ "learning_rate": 0.0004,
31726
+ "loss": 6.145,
31727
+ "step": 5204
31728
+ },
31729
+ {
31730
+ "epoch": 0.04,
31731
+ "learning_rate": 0.0004,
31732
+ "loss": 6.4543,
31733
+ "step": 5205
31734
+ },
31735
+ {
31736
+ "epoch": 0.04,
31737
+ "learning_rate": 0.0004,
31738
+ "loss": 2.7512,
31739
+ "step": 5206
31740
+ },
31741
+ {
31742
+ "epoch": 0.04,
31743
+ "learning_rate": 0.0004,
31744
+ "loss": 8.44,
31745
+ "step": 5207
31746
+ },
31747
+ {
31748
+ "epoch": 0.04,
31749
+ "learning_rate": 0.0004,
31750
+ "loss": 3.7399,
31751
+ "step": 5208
31752
+ },
31753
+ {
31754
+ "epoch": 0.04,
31755
+ "learning_rate": 0.0004,
31756
+ "loss": 6.133,
31757
+ "step": 5209
31758
+ },
31759
+ {
31760
+ "epoch": 0.04,
31761
+ "learning_rate": 0.0004,
31762
+ "loss": 5.1594,
31763
+ "step": 5210
31764
+ },
31765
+ {
31766
+ "epoch": 0.04,
31767
+ "learning_rate": 0.0004,
31768
+ "loss": 2.6307,
31769
+ "step": 5211
31770
+ },
31771
+ {
31772
+ "epoch": 0.04,
31773
+ "learning_rate": 0.0004,
31774
+ "loss": 3.4796,
31775
+ "step": 5212
31776
+ },
31777
+ {
31778
+ "epoch": 0.04,
31779
+ "learning_rate": 0.0004,
31780
+ "loss": 4.2767,
31781
+ "step": 5213
31782
+ },
31783
+ {
31784
+ "epoch": 0.04,
31785
+ "learning_rate": 0.0004,
31786
+ "loss": 8.9843,
31787
+ "step": 5214
31788
+ },
31789
+ {
31790
+ "epoch": 0.04,
31791
+ "learning_rate": 0.0004,
31792
+ "loss": 7.0799,
31793
+ "step": 5215
31794
+ },
31795
+ {
31796
+ "epoch": 0.04,
31797
+ "learning_rate": 0.0004,
31798
+ "loss": 7.1612,
31799
+ "step": 5216
31800
+ },
31801
+ {
31802
+ "epoch": 0.04,
31803
+ "learning_rate": 0.0004,
31804
+ "loss": 3.2503,
31805
+ "step": 5217
31806
+ },
31807
+ {
31808
+ "epoch": 0.04,
31809
+ "learning_rate": 0.0004,
31810
+ "loss": 4.4808,
31811
+ "step": 5218
31812
+ },
31813
+ {
31814
+ "epoch": 0.04,
31815
+ "learning_rate": 0.0004,
31816
+ "loss": 3.0658,
31817
+ "step": 5219
31818
+ },
31819
+ {
31820
+ "epoch": 0.04,
31821
+ "learning_rate": 0.0004,
31822
+ "loss": 8.8345,
31823
+ "step": 5220
31824
+ },
31825
+ {
31826
+ "epoch": 0.04,
31827
+ "learning_rate": 0.0004,
31828
+ "loss": 2.2569,
31829
+ "step": 5221
31830
+ },
31831
+ {
31832
+ "epoch": 0.04,
31833
+ "learning_rate": 0.0004,
31834
+ "loss": 6.3815,
31835
+ "step": 5222
31836
+ },
31837
+ {
31838
+ "epoch": 0.04,
31839
+ "learning_rate": 0.0004,
31840
+ "loss": 3.381,
31841
+ "step": 5223
31842
+ },
31843
+ {
31844
+ "epoch": 0.04,
31845
+ "learning_rate": 0.0004,
31846
+ "loss": 7.2786,
31847
+ "step": 5224
31848
+ },
31849
+ {
31850
+ "epoch": 0.04,
31851
+ "learning_rate": 0.0004,
31852
+ "loss": 8.1709,
31853
+ "step": 5225
31854
+ },
31855
+ {
31856
+ "epoch": 0.04,
31857
+ "learning_rate": 0.0004,
31858
+ "loss": 2.5537,
31859
+ "step": 5226
31860
+ },
31861
+ {
31862
+ "epoch": 0.04,
31863
+ "learning_rate": 0.0004,
31864
+ "loss": 6.498,
31865
+ "step": 5227
31866
+ },
31867
+ {
31868
+ "epoch": 0.04,
31869
+ "learning_rate": 0.0004,
31870
+ "loss": 6.9914,
31871
+ "step": 5228
31872
+ },
31873
+ {
31874
+ "epoch": 0.04,
31875
+ "learning_rate": 0.0004,
31876
+ "loss": 3.6207,
31877
+ "step": 5229
31878
+ },
31879
+ {
31880
+ "epoch": 0.04,
31881
+ "learning_rate": 0.0004,
31882
+ "loss": 5.1154,
31883
+ "step": 5230
31884
+ },
31885
+ {
31886
+ "epoch": 0.04,
31887
+ "learning_rate": 0.0004,
31888
+ "loss": 7.7722,
31889
+ "step": 5231
31890
+ },
31891
+ {
31892
+ "epoch": 0.04,
31893
+ "learning_rate": 0.0004,
31894
+ "loss": 3.0188,
31895
+ "step": 5232
31896
+ },
31897
+ {
31898
+ "epoch": 0.04,
31899
+ "learning_rate": 0.0004,
31900
+ "loss": 2.133,
31901
+ "step": 5233
31902
+ },
31903
+ {
31904
+ "epoch": 0.04,
31905
+ "learning_rate": 0.0004,
31906
+ "loss": 3.7835,
31907
+ "step": 5234
31908
+ },
31909
+ {
31910
+ "epoch": 0.04,
31911
+ "learning_rate": 0.0004,
31912
+ "loss": 2.1327,
31913
+ "step": 5235
31914
+ },
31915
+ {
31916
+ "epoch": 0.04,
31917
+ "learning_rate": 0.0004,
31918
+ "loss": 6.9416,
31919
+ "step": 5236
31920
+ },
31921
+ {
31922
+ "epoch": 0.04,
31923
+ "learning_rate": 0.0004,
31924
+ "loss": 7.1057,
31925
+ "step": 5237
31926
+ },
31927
+ {
31928
+ "epoch": 0.04,
31929
+ "learning_rate": 0.0004,
31930
+ "loss": 3.5148,
31931
+ "step": 5238
31932
+ },
31933
+ {
31934
+ "epoch": 0.04,
31935
+ "learning_rate": 0.0004,
31936
+ "loss": 3.8436,
31937
+ "step": 5239
31938
+ },
31939
+ {
31940
+ "epoch": 0.04,
31941
+ "learning_rate": 0.0004,
31942
+ "loss": 4.763,
31943
+ "step": 5240
31944
+ },
31945
+ {
31946
+ "epoch": 0.04,
31947
+ "learning_rate": 0.0004,
31948
+ "loss": 4.7498,
31949
+ "step": 5241
31950
+ },
31951
+ {
31952
+ "epoch": 0.04,
31953
+ "learning_rate": 0.0004,
31954
+ "loss": 6.7862,
31955
+ "step": 5242
31956
+ },
31957
+ {
31958
+ "epoch": 0.04,
31959
+ "learning_rate": 0.0004,
31960
+ "loss": 6.9326,
31961
+ "step": 5243
31962
+ },
31963
+ {
31964
+ "epoch": 0.04,
31965
+ "learning_rate": 0.0004,
31966
+ "loss": 2.1277,
31967
+ "step": 5244
31968
+ },
31969
+ {
31970
+ "epoch": 0.04,
31971
+ "learning_rate": 0.0004,
31972
+ "loss": 6.5697,
31973
+ "step": 5245
31974
+ },
31975
+ {
31976
+ "epoch": 0.04,
31977
+ "learning_rate": 0.0004,
31978
+ "loss": 2.0789,
31979
+ "step": 5246
31980
+ },
31981
+ {
31982
+ "epoch": 0.04,
31983
+ "learning_rate": 0.0004,
31984
+ "loss": 2.6917,
31985
+ "step": 5247
31986
+ },
31987
+ {
31988
+ "epoch": 0.04,
31989
+ "learning_rate": 0.0004,
31990
+ "loss": 2.5857,
31991
+ "step": 5248
31992
+ },
31993
+ {
31994
+ "epoch": 0.04,
31995
+ "learning_rate": 0.0004,
31996
+ "loss": 2.5904,
31997
+ "step": 5249
31998
+ },
31999
+ {
32000
+ "epoch": 0.04,
32001
+ "learning_rate": 0.0004,
32002
+ "loss": 3.6245,
32003
+ "step": 5250
32004
+ },
32005
+ {
32006
+ "epoch": 0.04,
32007
+ "learning_rate": 0.0004,
32008
+ "loss": 6.5449,
32009
+ "step": 5251
32010
+ },
32011
+ {
32012
+ "epoch": 0.04,
32013
+ "learning_rate": 0.0004,
32014
+ "loss": 9.9082,
32015
+ "step": 5252
32016
+ },
32017
+ {
32018
+ "epoch": 0.04,
32019
+ "learning_rate": 0.0004,
32020
+ "loss": 7.9662,
32021
+ "step": 5253
32022
+ },
32023
+ {
32024
+ "epoch": 0.04,
32025
+ "learning_rate": 0.0004,
32026
+ "loss": 7.733,
32027
+ "step": 5254
32028
+ },
32029
+ {
32030
+ "epoch": 0.04,
32031
+ "learning_rate": 0.0004,
32032
+ "loss": 7.1067,
32033
+ "step": 5255
32034
+ },
32035
+ {
32036
+ "epoch": 0.04,
32037
+ "learning_rate": 0.0004,
32038
+ "loss": 5.9188,
32039
+ "step": 5256
32040
+ },
32041
+ {
32042
+ "epoch": 0.04,
32043
+ "learning_rate": 0.0004,
32044
+ "loss": 4.5555,
32045
+ "step": 5257
32046
+ },
32047
+ {
32048
+ "epoch": 0.04,
32049
+ "learning_rate": 0.0004,
32050
+ "loss": 6.7376,
32051
+ "step": 5258
32052
+ },
32053
+ {
32054
+ "epoch": 0.04,
32055
+ "learning_rate": 0.0004,
32056
+ "loss": 9.3653,
32057
+ "step": 5259
32058
+ },
32059
+ {
32060
+ "epoch": 0.04,
32061
+ "learning_rate": 0.0004,
32062
+ "loss": 5.7456,
32063
+ "step": 5260
32064
+ },
32065
+ {
32066
+ "epoch": 0.04,
32067
+ "learning_rate": 0.0004,
32068
+ "loss": 6.1382,
32069
+ "step": 5261
32070
+ },
32071
+ {
32072
+ "epoch": 0.04,
32073
+ "learning_rate": 0.0004,
32074
+ "loss": 2.472,
32075
+ "step": 5262
32076
+ },
32077
+ {
32078
+ "epoch": 0.04,
32079
+ "learning_rate": 0.0004,
32080
+ "loss": 7.0488,
32081
+ "step": 5263
32082
+ },
32083
+ {
32084
+ "epoch": 0.04,
32085
+ "learning_rate": 0.0004,
32086
+ "loss": 7.4769,
32087
+ "step": 5264
32088
+ },
32089
+ {
32090
+ "epoch": 0.04,
32091
+ "learning_rate": 0.0004,
32092
+ "loss": 4.5465,
32093
+ "step": 5265
32094
+ },
32095
+ {
32096
+ "epoch": 0.04,
32097
+ "learning_rate": 0.0004,
32098
+ "loss": 2.8687,
32099
+ "step": 5266
32100
+ },
32101
+ {
32102
+ "epoch": 0.04,
32103
+ "learning_rate": 0.0004,
32104
+ "loss": 6.535,
32105
+ "step": 5267
32106
+ },
32107
+ {
32108
+ "epoch": 0.04,
32109
+ "learning_rate": 0.0004,
32110
+ "loss": 3.935,
32111
+ "step": 5268
32112
+ },
32113
+ {
32114
+ "epoch": 0.04,
32115
+ "learning_rate": 0.0004,
32116
+ "loss": 6.2081,
32117
+ "step": 5269
32118
+ },
32119
+ {
32120
+ "epoch": 0.04,
32121
+ "learning_rate": 0.0004,
32122
+ "loss": 3.5528,
32123
+ "step": 5270
32124
+ },
32125
+ {
32126
+ "epoch": 0.04,
32127
+ "learning_rate": 0.0004,
32128
+ "loss": 5.2201,
32129
+ "step": 5271
32130
+ },
32131
+ {
32132
+ "epoch": 0.04,
32133
+ "learning_rate": 0.0004,
32134
+ "loss": 6.3348,
32135
+ "step": 5272
32136
+ },
32137
+ {
32138
+ "epoch": 0.04,
32139
+ "learning_rate": 0.0004,
32140
+ "loss": 6.8958,
32141
+ "step": 5273
32142
+ },
32143
+ {
32144
+ "epoch": 0.04,
32145
+ "learning_rate": 0.0004,
32146
+ "loss": 6.2687,
32147
+ "step": 5274
32148
+ },
32149
+ {
32150
+ "epoch": 0.04,
32151
+ "learning_rate": 0.0004,
32152
+ "loss": 4.2481,
32153
+ "step": 5275
32154
+ },
32155
+ {
32156
+ "epoch": 0.04,
32157
+ "learning_rate": 0.0004,
32158
+ "loss": 3.1491,
32159
+ "step": 5276
32160
+ },
32161
+ {
32162
+ "epoch": 0.04,
32163
+ "learning_rate": 0.0004,
32164
+ "loss": 2.9855,
32165
+ "step": 5277
32166
+ },
32167
+ {
32168
+ "epoch": 0.04,
32169
+ "learning_rate": 0.0004,
32170
+ "loss": 6.7815,
32171
+ "step": 5278
32172
+ },
32173
+ {
32174
+ "epoch": 0.04,
32175
+ "learning_rate": 0.0004,
32176
+ "loss": 8.0858,
32177
+ "step": 5279
32178
+ },
32179
+ {
32180
+ "epoch": 0.04,
32181
+ "learning_rate": 0.0004,
32182
+ "loss": 5.8508,
32183
+ "step": 5280
32184
+ },
32185
+ {
32186
+ "epoch": 0.04,
32187
+ "learning_rate": 0.0004,
32188
+ "loss": 6.1981,
32189
+ "step": 5281
32190
+ },
32191
+ {
32192
+ "epoch": 0.04,
32193
+ "learning_rate": 0.0004,
32194
+ "loss": 6.0052,
32195
+ "step": 5282
32196
+ },
32197
+ {
32198
+ "epoch": 0.04,
32199
+ "learning_rate": 0.0004,
32200
+ "loss": 6.6153,
32201
+ "step": 5283
32202
+ },
32203
+ {
32204
+ "epoch": 0.04,
32205
+ "learning_rate": 0.0004,
32206
+ "loss": 3.3998,
32207
+ "step": 5284
32208
+ },
32209
+ {
32210
+ "epoch": 0.04,
32211
+ "learning_rate": 0.0004,
32212
+ "loss": 7.9997,
32213
+ "step": 5285
32214
+ },
32215
+ {
32216
+ "epoch": 0.04,
32217
+ "learning_rate": 0.0004,
32218
+ "loss": 4.3768,
32219
+ "step": 5286
32220
+ },
32221
+ {
32222
+ "epoch": 0.04,
32223
+ "learning_rate": 0.0004,
32224
+ "loss": 7.3477,
32225
+ "step": 5287
32226
+ },
32227
+ {
32228
+ "epoch": 0.04,
32229
+ "learning_rate": 0.0004,
32230
+ "loss": 3.5558,
32231
+ "step": 5288
32232
+ },
32233
+ {
32234
+ "epoch": 0.04,
32235
+ "learning_rate": 0.0004,
32236
+ "loss": 5.4668,
32237
+ "step": 5289
32238
+ },
32239
+ {
32240
+ "epoch": 0.04,
32241
+ "learning_rate": 0.0004,
32242
+ "loss": 6.571,
32243
+ "step": 5290
32244
+ },
32245
+ {
32246
+ "epoch": 0.04,
32247
+ "learning_rate": 0.0004,
32248
+ "loss": 4.8902,
32249
+ "step": 5291
32250
+ },
32251
+ {
32252
+ "epoch": 0.04,
32253
+ "learning_rate": 0.0004,
32254
+ "loss": 8.3882,
32255
+ "step": 5292
32256
+ },
32257
+ {
32258
+ "epoch": 0.04,
32259
+ "learning_rate": 0.0004,
32260
+ "loss": 2.6453,
32261
+ "step": 5293
32262
+ },
32263
+ {
32264
+ "epoch": 0.04,
32265
+ "learning_rate": 0.0004,
32266
+ "loss": 3.0346,
32267
+ "step": 5294
32268
+ },
32269
+ {
32270
+ "epoch": 0.04,
32271
+ "learning_rate": 0.0004,
32272
+ "loss": 6.6004,
32273
+ "step": 5295
32274
+ },
32275
+ {
32276
+ "epoch": 0.04,
32277
+ "learning_rate": 0.0004,
32278
+ "loss": 3.2908,
32279
+ "step": 5296
32280
+ },
32281
+ {
32282
+ "epoch": 0.04,
32283
+ "learning_rate": 0.0004,
32284
+ "loss": 5.8072,
32285
+ "step": 5297
32286
+ },
32287
+ {
32288
+ "epoch": 0.04,
32289
+ "learning_rate": 0.0004,
32290
+ "loss": 5.5529,
32291
+ "step": 5298
32292
+ },
32293
+ {
32294
+ "epoch": 0.04,
32295
+ "learning_rate": 0.0004,
32296
+ "loss": 2.8983,
32297
+ "step": 5299
32298
+ },
32299
+ {
32300
+ "epoch": 0.04,
32301
+ "learning_rate": 0.0004,
32302
+ "loss": 5.3365,
32303
+ "step": 5300
32304
+ },
32305
+ {
32306
+ "epoch": 0.04,
32307
+ "learning_rate": 0.0004,
32308
+ "loss": 7.7748,
32309
+ "step": 5301
32310
+ },
32311
+ {
32312
+ "epoch": 0.04,
32313
+ "learning_rate": 0.0004,
32314
+ "loss": 8.1817,
32315
+ "step": 5302
32316
+ },
32317
+ {
32318
+ "epoch": 0.04,
32319
+ "learning_rate": 0.0004,
32320
+ "loss": 4.1362,
32321
+ "step": 5303
32322
+ },
32323
+ {
32324
+ "epoch": 0.04,
32325
+ "learning_rate": 0.0004,
32326
+ "loss": 7.4656,
32327
+ "step": 5304
32328
+ },
32329
+ {
32330
+ "epoch": 0.04,
32331
+ "learning_rate": 0.0004,
32332
+ "loss": 8.1376,
32333
+ "step": 5305
32334
+ },
32335
+ {
32336
+ "epoch": 0.04,
32337
+ "learning_rate": 0.0004,
32338
+ "loss": 8.9722,
32339
+ "step": 5306
32340
+ },
32341
+ {
32342
+ "epoch": 0.04,
32343
+ "learning_rate": 0.0004,
32344
+ "loss": 6.6619,
32345
+ "step": 5307
32346
+ },
32347
+ {
32348
+ "epoch": 0.04,
32349
+ "learning_rate": 0.0004,
32350
+ "loss": 7.3412,
32351
+ "step": 5308
32352
+ },
32353
+ {
32354
+ "epoch": 0.04,
32355
+ "learning_rate": 0.0004,
32356
+ "loss": 2.8955,
32357
+ "step": 5309
32358
+ },
32359
+ {
32360
+ "epoch": 0.04,
32361
+ "learning_rate": 0.0004,
32362
+ "loss": 8.0063,
32363
+ "step": 5310
32364
+ },
32365
+ {
32366
+ "epoch": 0.04,
32367
+ "learning_rate": 0.0004,
32368
+ "loss": 2.7085,
32369
+ "step": 5311
32370
+ },
32371
+ {
32372
+ "epoch": 0.04,
32373
+ "learning_rate": 0.0004,
32374
+ "loss": 7.3925,
32375
+ "step": 5312
32376
+ },
32377
+ {
32378
+ "epoch": 0.04,
32379
+ "learning_rate": 0.0004,
32380
+ "loss": 2.8739,
32381
+ "step": 5313
32382
+ },
32383
+ {
32384
+ "epoch": 0.04,
32385
+ "learning_rate": 0.0004,
32386
+ "loss": 5.2402,
32387
+ "step": 5314
32388
+ },
32389
+ {
32390
+ "epoch": 0.04,
32391
+ "learning_rate": 0.0004,
32392
+ "loss": 8.3129,
32393
+ "step": 5315
32394
+ },
32395
+ {
32396
+ "epoch": 0.04,
32397
+ "learning_rate": 0.0004,
32398
+ "loss": 7.4129,
32399
+ "step": 5316
32400
+ },
32401
+ {
32402
+ "epoch": 0.04,
32403
+ "learning_rate": 0.0004,
32404
+ "loss": 7.722,
32405
+ "step": 5317
32406
+ },
32407
+ {
32408
+ "epoch": 0.04,
32409
+ "learning_rate": 0.0004,
32410
+ "loss": 8.2092,
32411
+ "step": 5318
32412
+ },
32413
+ {
32414
+ "epoch": 0.04,
32415
+ "learning_rate": 0.0004,
32416
+ "loss": 5.2358,
32417
+ "step": 5319
32418
+ },
32419
+ {
32420
+ "epoch": 0.04,
32421
+ "learning_rate": 0.0004,
32422
+ "loss": 7.9978,
32423
+ "step": 5320
32424
+ },
32425
+ {
32426
+ "epoch": 0.04,
32427
+ "learning_rate": 0.0004,
32428
+ "loss": 7.9746,
32429
+ "step": 5321
32430
+ },
32431
+ {
32432
+ "epoch": 0.04,
32433
+ "learning_rate": 0.0004,
32434
+ "loss": 4.1538,
32435
+ "step": 5322
32436
+ },
32437
+ {
32438
+ "epoch": 0.04,
32439
+ "learning_rate": 0.0004,
32440
+ "loss": 6.4998,
32441
+ "step": 5323
32442
+ },
32443
+ {
32444
+ "epoch": 0.04,
32445
+ "learning_rate": 0.0004,
32446
+ "loss": 3.8847,
32447
+ "step": 5324
32448
+ },
32449
+ {
32450
+ "epoch": 0.04,
32451
+ "learning_rate": 0.0004,
32452
+ "loss": 6.3631,
32453
+ "step": 5325
32454
+ },
32455
+ {
32456
+ "epoch": 0.04,
32457
+ "learning_rate": 0.0004,
32458
+ "loss": 5.1982,
32459
+ "step": 5326
32460
+ },
32461
+ {
32462
+ "epoch": 0.04,
32463
+ "learning_rate": 0.0004,
32464
+ "loss": 3.6708,
32465
+ "step": 5327
32466
+ },
32467
+ {
32468
+ "epoch": 0.04,
32469
+ "learning_rate": 0.0004,
32470
+ "loss": 5.3822,
32471
+ "step": 5328
32472
+ },
32473
+ {
32474
+ "epoch": 0.04,
32475
+ "learning_rate": 0.0004,
32476
+ "loss": 9.2081,
32477
+ "step": 5329
32478
+ },
32479
+ {
32480
+ "epoch": 0.04,
32481
+ "learning_rate": 0.0004,
32482
+ "loss": 2.4944,
32483
+ "step": 5330
32484
+ },
32485
+ {
32486
+ "epoch": 0.04,
32487
+ "learning_rate": 0.0004,
32488
+ "loss": 4.5158,
32489
+ "step": 5331
32490
+ },
32491
+ {
32492
+ "epoch": 0.04,
32493
+ "learning_rate": 0.0004,
32494
+ "loss": 3.287,
32495
+ "step": 5332
32496
+ },
32497
+ {
32498
+ "epoch": 0.04,
32499
+ "learning_rate": 0.0004,
32500
+ "loss": 6.0359,
32501
+ "step": 5333
32502
+ },
32503
+ {
32504
+ "epoch": 0.04,
32505
+ "learning_rate": 0.0004,
32506
+ "loss": 5.2941,
32507
+ "step": 5334
32508
+ },
32509
+ {
32510
+ "epoch": 0.04,
32511
+ "learning_rate": 0.0004,
32512
+ "loss": 6.0545,
32513
+ "step": 5335
32514
+ },
32515
+ {
32516
+ "epoch": 0.04,
32517
+ "learning_rate": 0.0004,
32518
+ "loss": 5.9831,
32519
+ "step": 5336
32520
+ },
32521
+ {
32522
+ "epoch": 0.04,
32523
+ "learning_rate": 0.0004,
32524
+ "loss": 5.0593,
32525
+ "step": 5337
32526
+ },
32527
+ {
32528
+ "epoch": 0.04,
32529
+ "learning_rate": 0.0004,
32530
+ "loss": 2.3721,
32531
+ "step": 5338
32532
+ },
32533
+ {
32534
+ "epoch": 0.04,
32535
+ "learning_rate": 0.0004,
32536
+ "loss": 2.6548,
32537
+ "step": 5339
32538
+ },
32539
+ {
32540
+ "epoch": 0.04,
32541
+ "learning_rate": 0.0004,
32542
+ "loss": 7.4947,
32543
+ "step": 5340
32544
+ },
32545
+ {
32546
+ "epoch": 0.04,
32547
+ "learning_rate": 0.0004,
32548
+ "loss": 5.9871,
32549
+ "step": 5341
32550
+ },
32551
+ {
32552
+ "epoch": 0.04,
32553
+ "learning_rate": 0.0004,
32554
+ "loss": 5.8511,
32555
+ "step": 5342
32556
+ },
32557
+ {
32558
+ "epoch": 0.04,
32559
+ "learning_rate": 0.0004,
32560
+ "loss": 5.0414,
32561
+ "step": 5343
32562
+ },
32563
+ {
32564
+ "epoch": 0.04,
32565
+ "learning_rate": 0.0004,
32566
+ "loss": 4.5975,
32567
+ "step": 5344
32568
+ },
32569
+ {
32570
+ "epoch": 0.04,
32571
+ "learning_rate": 0.0004,
32572
+ "loss": 4.9401,
32573
+ "step": 5345
32574
+ },
32575
+ {
32576
+ "epoch": 0.04,
32577
+ "learning_rate": 0.0004,
32578
+ "loss": 6.1735,
32579
+ "step": 5346
32580
+ },
32581
+ {
32582
+ "epoch": 0.04,
32583
+ "learning_rate": 0.0004,
32584
+ "loss": 8.225,
32585
+ "step": 5347
32586
+ },
32587
+ {
32588
+ "epoch": 0.04,
32589
+ "learning_rate": 0.0004,
32590
+ "loss": 2.3719,
32591
+ "step": 5348
32592
+ },
32593
+ {
32594
+ "epoch": 0.04,
32595
+ "learning_rate": 0.0004,
32596
+ "loss": 7.0078,
32597
+ "step": 5349
32598
+ },
32599
+ {
32600
+ "epoch": 0.04,
32601
+ "learning_rate": 0.0004,
32602
+ "loss": 7.067,
32603
+ "step": 5350
32604
+ },
32605
+ {
32606
+ "epoch": 0.04,
32607
+ "learning_rate": 0.0004,
32608
+ "loss": 6.2358,
32609
+ "step": 5351
32610
+ },
32611
+ {
32612
+ "epoch": 0.04,
32613
+ "learning_rate": 0.0004,
32614
+ "loss": 7.7797,
32615
+ "step": 5352
32616
+ },
32617
+ {
32618
+ "epoch": 0.04,
32619
+ "learning_rate": 0.0004,
32620
+ "loss": 7.3039,
32621
+ "step": 5353
32622
+ },
32623
+ {
32624
+ "epoch": 0.04,
32625
+ "learning_rate": 0.0004,
32626
+ "loss": 7.7088,
32627
+ "step": 5354
32628
+ },
32629
+ {
32630
+ "epoch": 0.04,
32631
+ "learning_rate": 0.0004,
32632
+ "loss": 5.8908,
32633
+ "step": 5355
32634
+ },
32635
+ {
32636
+ "epoch": 0.04,
32637
+ "learning_rate": 0.0004,
32638
+ "loss": 6.053,
32639
+ "step": 5356
32640
+ },
32641
+ {
32642
+ "epoch": 0.04,
32643
+ "learning_rate": 0.0004,
32644
+ "loss": 8.1785,
32645
+ "step": 5357
32646
+ },
32647
+ {
32648
+ "epoch": 0.04,
32649
+ "learning_rate": 0.0004,
32650
+ "loss": 8.9685,
32651
+ "step": 5358
32652
+ },
32653
+ {
32654
+ "epoch": 0.04,
32655
+ "learning_rate": 0.0004,
32656
+ "loss": 3.5938,
32657
+ "step": 5359
32658
+ },
32659
+ {
32660
+ "epoch": 0.04,
32661
+ "learning_rate": 0.0004,
32662
+ "loss": 3.55,
32663
+ "step": 5360
32664
+ },
32665
+ {
32666
+ "epoch": 0.04,
32667
+ "learning_rate": 0.0004,
32668
+ "loss": 8.9066,
32669
+ "step": 5361
32670
+ },
32671
+ {
32672
+ "epoch": 0.04,
32673
+ "learning_rate": 0.0004,
32674
+ "loss": 7.1162,
32675
+ "step": 5362
32676
+ },
32677
+ {
32678
+ "epoch": 0.04,
32679
+ "learning_rate": 0.0004,
32680
+ "loss": 4.3855,
32681
+ "step": 5363
32682
+ },
32683
+ {
32684
+ "epoch": 0.04,
32685
+ "learning_rate": 0.0004,
32686
+ "loss": 7.3739,
32687
+ "step": 5364
32688
+ },
32689
+ {
32690
+ "epoch": 0.04,
32691
+ "learning_rate": 0.0004,
32692
+ "loss": 4.6735,
32693
+ "step": 5365
32694
+ },
32695
+ {
32696
+ "epoch": 0.04,
32697
+ "learning_rate": 0.0004,
32698
+ "loss": 7.4358,
32699
+ "step": 5366
32700
+ },
32701
+ {
32702
+ "epoch": 0.04,
32703
+ "learning_rate": 0.0004,
32704
+ "loss": 8.0145,
32705
+ "step": 5367
32706
+ },
32707
+ {
32708
+ "epoch": 0.04,
32709
+ "learning_rate": 0.0004,
32710
+ "loss": 6.3903,
32711
+ "step": 5368
32712
+ },
32713
+ {
32714
+ "epoch": 0.04,
32715
+ "learning_rate": 0.0004,
32716
+ "loss": 6.7513,
32717
+ "step": 5369
32718
+ },
32719
+ {
32720
+ "epoch": 0.04,
32721
+ "learning_rate": 0.0004,
32722
+ "loss": 6.3265,
32723
+ "step": 5370
32724
+ },
32725
+ {
32726
+ "epoch": 0.04,
32727
+ "learning_rate": 0.0004,
32728
+ "loss": 6.0655,
32729
+ "step": 5371
32730
+ },
32731
+ {
32732
+ "epoch": 0.04,
32733
+ "learning_rate": 0.0004,
32734
+ "loss": 5.046,
32735
+ "step": 5372
32736
+ },
32737
+ {
32738
+ "epoch": 0.04,
32739
+ "learning_rate": 0.0004,
32740
+ "loss": 5.9508,
32741
+ "step": 5373
32742
+ },
32743
+ {
32744
+ "epoch": 0.04,
32745
+ "learning_rate": 0.0004,
32746
+ "loss": 4.7946,
32747
+ "step": 5374
32748
+ },
32749
+ {
32750
+ "epoch": 0.04,
32751
+ "learning_rate": 0.0004,
32752
+ "loss": 6.4541,
32753
+ "step": 5375
32754
+ },
32755
+ {
32756
+ "epoch": 0.04,
32757
+ "learning_rate": 0.0004,
32758
+ "loss": 6.6098,
32759
+ "step": 5376
32760
+ },
32761
+ {
32762
+ "epoch": 0.04,
32763
+ "learning_rate": 0.0004,
32764
+ "loss": 4.124,
32765
+ "step": 5377
32766
+ },
32767
+ {
32768
+ "epoch": 0.04,
32769
+ "learning_rate": 0.0004,
32770
+ "loss": 6.539,
32771
+ "step": 5378
32772
+ },
32773
+ {
32774
+ "epoch": 0.04,
32775
+ "learning_rate": 0.0004,
32776
+ "loss": 7.8777,
32777
+ "step": 5379
32778
+ },
32779
+ {
32780
+ "epoch": 0.04,
32781
+ "learning_rate": 0.0004,
32782
+ "loss": 6.6315,
32783
+ "step": 5380
32784
+ },
32785
+ {
32786
+ "epoch": 0.04,
32787
+ "learning_rate": 0.0004,
32788
+ "loss": 7.1006,
32789
+ "step": 5381
32790
+ },
32791
+ {
32792
+ "epoch": 0.04,
32793
+ "learning_rate": 0.0004,
32794
+ "loss": 5.1972,
32795
+ "step": 5382
32796
+ },
32797
+ {
32798
+ "epoch": 0.04,
32799
+ "learning_rate": 0.0004,
32800
+ "loss": 8.1427,
32801
+ "step": 5383
32802
+ },
32803
+ {
32804
+ "epoch": 0.04,
32805
+ "learning_rate": 0.0004,
32806
+ "loss": 6.1585,
32807
+ "step": 5384
32808
+ },
32809
+ {
32810
+ "epoch": 0.04,
32811
+ "learning_rate": 0.0004,
32812
+ "loss": 2.6632,
32813
+ "step": 5385
32814
+ },
32815
+ {
32816
+ "epoch": 0.04,
32817
+ "learning_rate": 0.0004,
32818
+ "loss": 3.2398,
32819
+ "step": 5386
32820
+ },
32821
+ {
32822
+ "epoch": 0.04,
32823
+ "learning_rate": 0.0004,
32824
+ "loss": 2.3961,
32825
+ "step": 5387
32826
+ },
32827
+ {
32828
+ "epoch": 0.04,
32829
+ "learning_rate": 0.0004,
32830
+ "loss": 4.7233,
32831
+ "step": 5388
32832
+ },
32833
+ {
32834
+ "epoch": 0.04,
32835
+ "learning_rate": 0.0004,
32836
+ "loss": 7.0959,
32837
+ "step": 5389
32838
+ },
32839
+ {
32840
+ "epoch": 0.04,
32841
+ "learning_rate": 0.0004,
32842
+ "loss": 5.5001,
32843
+ "step": 5390
32844
+ },
32845
+ {
32846
+ "epoch": 0.04,
32847
+ "learning_rate": 0.0004,
32848
+ "loss": 3.0294,
32849
+ "step": 5391
32850
+ },
32851
+ {
32852
+ "epoch": 0.04,
32853
+ "learning_rate": 0.0004,
32854
+ "loss": 7.0155,
32855
+ "step": 5392
32856
+ },
32857
+ {
32858
+ "epoch": 0.04,
32859
+ "learning_rate": 0.0004,
32860
+ "loss": 3.2199,
32861
+ "step": 5393
32862
+ },
32863
+ {
32864
+ "epoch": 0.04,
32865
+ "learning_rate": 0.0004,
32866
+ "loss": 3.3572,
32867
+ "step": 5394
32868
+ },
32869
+ {
32870
+ "epoch": 0.04,
32871
+ "learning_rate": 0.0004,
32872
+ "loss": 7.7692,
32873
+ "step": 5395
32874
+ },
32875
+ {
32876
+ "epoch": 0.04,
32877
+ "learning_rate": 0.0004,
32878
+ "loss": 7.6206,
32879
+ "step": 5396
32880
+ },
32881
+ {
32882
+ "epoch": 0.04,
32883
+ "learning_rate": 0.0004,
32884
+ "loss": 4.5941,
32885
+ "step": 5397
32886
+ },
32887
+ {
32888
+ "epoch": 0.04,
32889
+ "learning_rate": 0.0004,
32890
+ "loss": 3.6666,
32891
+ "step": 5398
32892
+ },
32893
+ {
32894
+ "epoch": 0.04,
32895
+ "learning_rate": 0.0004,
32896
+ "loss": 2.2717,
32897
+ "step": 5399
32898
+ },
32899
+ {
32900
+ "epoch": 0.04,
32901
+ "learning_rate": 0.0004,
32902
+ "loss": 4.9048,
32903
+ "step": 5400
32904
+ },
32905
+ {
32906
+ "epoch": 0.04,
32907
+ "eval_loss": 6.526280403137207,
32908
+ "eval_runtime": 22.3472,
32909
+ "eval_samples_per_second": 2.237,
32910
+ "eval_steps_per_second": 1.119,
32911
+ "step": 5400
32912
+ },
32913
+ {
32914
+ "epoch": 0.04,
32915
+ "mmlu_eval_accuracy": 0.2525477994227994,
32916
+ "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182,
32917
+ "mmlu_eval_accuracy_anatomy": 0.07142857142857142,
32918
+ "mmlu_eval_accuracy_astronomy": 0.3125,
32919
+ "mmlu_eval_accuracy_business_ethics": 0.4444444444444444,
32920
+ "mmlu_loss": 3.8057762241363524,
32921
+ "step": 5400
32922
+ },
32923
+ {
32924
+ "epoch": 0.04,
32925
+ "step": 5400,
32926
+ "total_flos": 8.918950910784307e+16,
32927
+ "train_loss": 0.6445872698006807,
32928
+ "train_runtime": 1748.3273,
32929
+ "train_samples_per_second": 17.159,
32930
+ "train_steps_per_second": 17.159
32931
  }
32932
  ],
32933
  "max_steps": 30000,
32934
  "num_train_epochs": 1,
32935
+ "total_flos": 8.918950910784307e+16,
32936
  "trial_name": null,
32937
  "trial_params": null
32938
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd2e97ea6d2b8e9c0ed7efbb032ce79458292aa99ced1bbeb7b777b9663a324f
3
  size 6011
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe7d931ebfbcece1009124b9eae98d1a465edd703240c0655ee9bb17db395973
3
  size 6011