edpowers committed on
Commit
bb7f189
·
verified ·
1 Parent(s): 852cb62

End of training

Browse files
README.md CHANGED
@@ -21,7 +21,7 @@ should probably proofread and complete it, then remove this comment. -->
21
 
22
  This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) on the generator dataset.
23
  It achieves the following results on the evaluation set:
24
- - Loss: 1.2003
25
 
26
  ## Model description
27
 
@@ -55,46 +55,46 @@ The following hyperparameters were used during training:
55
 
56
  | Training Loss | Epoch | Step | Validation Loss |
57
  |:-------------:|:------:|:----:|:---------------:|
58
- | 0.9474 | 0.1479 | 25 | 0.6950 |
59
- | 0.6403 | 0.2959 | 50 | 0.6291 |
60
- | 0.6184 | 0.4438 | 75 | 0.6191 |
61
- | 0.6041 | 0.5917 | 100 | 0.6083 |
62
- | 0.6023 | 0.7396 | 125 | 0.6047 |
63
- | 0.6086 | 0.8876 | 150 | 0.6014 |
64
- | 0.5545 | 1.0355 | 175 | 0.6375 |
65
- | 0.4278 | 1.1834 | 200 | 0.6235 |
66
- | 0.4297 | 1.3314 | 225 | 0.6276 |
67
- | 0.4172 | 1.4793 | 250 | 0.6314 |
68
- | 0.4273 | 1.6272 | 275 | 0.6375 |
69
- | 0.4264 | 1.7751 | 300 | 0.6345 |
70
- | 0.428 | 1.9231 | 325 | 0.6296 |
71
- | 0.3524 | 2.0710 | 350 | 0.7067 |
72
- | 0.2697 | 2.2189 | 375 | 0.7314 |
73
- | 0.2645 | 2.3669 | 400 | 0.7255 |
74
- | 0.2778 | 2.5148 | 425 | 0.7221 |
75
- | 0.2687 | 2.6627 | 450 | 0.7373 |
76
- | 0.2748 | 2.8107 | 475 | 0.7250 |
77
- | 0.2737 | 2.9586 | 500 | 0.7214 |
78
- | 0.1848 | 3.1065 | 525 | 0.8414 |
79
- | 0.1535 | 3.2544 | 550 | 0.8438 |
80
- | 0.1565 | 3.4024 | 575 | 0.8479 |
81
- | 0.1583 | 3.5503 | 600 | 0.8719 |
82
- | 0.1537 | 3.6982 | 625 | 0.8497 |
83
- | 0.1611 | 3.8462 | 650 | 0.8587 |
84
- | 0.16 | 3.9941 | 675 | 0.8714 |
85
- | 0.0913 | 4.1420 | 700 | 1.0194 |
86
- | 0.0841 | 4.2899 | 725 | 1.0429 |
87
- | 0.0836 | 4.4379 | 750 | 1.0544 |
88
- | 0.0842 | 4.5858 | 775 | 1.0432 |
89
- | 0.081 | 4.7337 | 800 | 1.0386 |
90
- | 0.0848 | 4.8817 | 825 | 1.0705 |
91
- | 0.0776 | 5.0296 | 850 | 1.0773 |
92
- | 0.0485 | 5.1775 | 875 | 1.1910 |
93
- | 0.0481 | 5.3254 | 900 | 1.1800 |
94
- | 0.0481 | 5.4734 | 925 | 1.1915 |
95
- | 0.0467 | 5.6213 | 950 | 1.1996 |
96
- | 0.0468 | 5.7692 | 975 | 1.1961 |
97
- | 0.0449 | 5.9172 | 1000 | 1.2003 |
98
 
99
 
100
  ### Framework versions
 
21
 
22
  This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) on the generator dataset.
23
  It achieves the following results on the evaluation set:
24
+ - Loss: 1.1832
25
 
26
  ## Model description
27
 
 
55
 
56
  | Training Loss | Epoch | Step | Validation Loss |
57
  |:-------------:|:------:|:----:|:---------------:|
58
+ | 0.9391 | 0.1479 | 25 | 0.6653 |
59
+ | 0.6138 | 0.2959 | 50 | 0.6126 |
60
+ | 0.6039 | 0.4438 | 75 | 0.6061 |
61
+ | 0.5927 | 0.5917 | 100 | 0.5998 |
62
+ | 0.5973 | 0.7396 | 125 | 0.5946 |
63
+ | 0.602 | 0.8876 | 150 | 0.5943 |
64
+ | 0.547 | 1.0355 | 175 | 0.6319 |
65
+ | 0.4239 | 1.1834 | 200 | 0.6169 |
66
+ | 0.4301 | 1.3314 | 225 | 0.6158 |
67
+ | 0.4176 | 1.4793 | 250 | 0.6193 |
68
+ | 0.4295 | 1.6272 | 275 | 0.6242 |
69
+ | 0.4252 | 1.7751 | 300 | 0.6265 |
70
+ | 0.4252 | 1.9231 | 325 | 0.6264 |
71
+ | 0.3591 | 2.0710 | 350 | 0.6893 |
72
+ | 0.2758 | 2.2189 | 375 | 0.7153 |
73
+ | 0.2702 | 2.3669 | 400 | 0.7170 |
74
+ | 0.2797 | 2.5148 | 425 | 0.7173 |
75
+ | 0.2727 | 2.6627 | 450 | 0.7144 |
76
+ | 0.2817 | 2.8107 | 475 | 0.7169 |
77
+ | 0.2798 | 2.9586 | 500 | 0.7016 |
78
+ | 0.1922 | 3.1065 | 525 | 0.8090 |
79
+ | 0.16 | 3.2544 | 550 | 0.8373 |
80
+ | 0.1623 | 3.4024 | 575 | 0.8372 |
81
+ | 0.1632 | 3.5503 | 600 | 0.8402 |
82
+ | 0.1618 | 3.6982 | 625 | 0.8558 |
83
+ | 0.1732 | 3.8462 | 650 | 0.8581 |
84
+ | 0.1687 | 3.9941 | 675 | 0.8611 |
85
+ | 0.0961 | 4.1420 | 700 | 0.9902 |
86
+ | 0.0879 | 4.2899 | 725 | 1.0102 |
87
+ | 0.0899 | 4.4379 | 750 | 1.0345 |
88
+ | 0.0899 | 4.5858 | 775 | 1.0256 |
89
+ | 0.0882 | 4.7337 | 800 | 1.0273 |
90
+ | 0.0893 | 4.8817 | 825 | 1.0559 |
91
+ | 0.0824 | 5.0296 | 850 | 1.0753 |
92
+ | 0.052 | 5.1775 | 875 | 1.1582 |
93
+ | 0.052 | 5.3254 | 900 | 1.1643 |
94
+ | 0.0526 | 5.4734 | 925 | 1.1923 |
95
+ | 0.0497 | 5.6213 | 950 | 1.1759 |
96
+ | 0.0496 | 5.7692 | 975 | 1.1812 |
97
+ | 0.0477 | 5.9172 | 1000 | 1.1832 |
98
 
99
 
100
  ### Framework versions
adapter_config.json CHANGED
@@ -20,11 +20,11 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "o_proj",
 
24
  "v_proj",
25
- "k_proj",
26
- "gate_proj",
27
- "q_proj"
28
  ],
29
  "task_type": "CAUSAL_LM",
30
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "gate_proj",
24
  "o_proj",
25
+ "q_proj",
26
  "v_proj",
27
+ "k_proj"
 
 
28
  ],
29
  "task_type": "CAUSAL_LM",
30
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:35f0093b376116b7612ec6995570455d70c9eca1a2b9373d52d74bc000d2d7e6
3
- size 708913608
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68fa615f2e2431c2a4128c632f3e5b71376ad7a9a3fbb213177c8693f0e6db6d
3
+ size 616639296
all_results.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
- "epoch": 5.923076923076923,
3
- "eval_loss": 1.2003010511398315,
4
- "eval_runtime": 30.7157,
5
  "eval_samples": 169,
6
- "eval_samples_per_second": 2.898,
7
- "eval_steps_per_second": 0.391,
8
- "total_flos": 1.7606154086724403e+17,
9
- "train_loss": 4.533969319902815e-05,
10
- "train_runtime": 4.6313,
11
  "train_samples": 1346,
12
- "train_samples_per_second": 863.692,
13
- "train_steps_per_second": 215.923
14
  }
 
1
  {
2
+ "epoch": 5.9171597633136095,
3
+ "eval_loss": 1.1831614971160889,
4
+ "eval_runtime": 5.5457,
5
  "eval_samples": 169,
6
+ "eval_samples_per_second": 16.049,
7
+ "eval_steps_per_second": 2.164,
8
+ "total_flos": 1.75885655212032e+17,
9
+ "train_loss": 0.2793775268793106,
10
+ "train_runtime": 1218.5958,
11
  "train_samples": 1346,
12
+ "train_samples_per_second": 3.282,
13
+ "train_steps_per_second": 0.821
14
  }
config.json CHANGED
@@ -16,21 +16,6 @@
16
  "num_hidden_layers": 32,
17
  "num_key_value_heads": 8,
18
  "pad_token_id": 32001,
19
- "quantization_config": {
20
- "_load_in_4bit": true,
21
- "_load_in_8bit": false,
22
- "bnb_4bit_compute_dtype": "bfloat16",
23
- "bnb_4bit_quant_storage": "uint8",
24
- "bnb_4bit_quant_type": "nf4",
25
- "bnb_4bit_use_double_quant": false,
26
- "llm_int8_enable_fp32_cpu_offload": false,
27
- "llm_int8_has_fp16_weight": false,
28
- "llm_int8_skip_modules": null,
29
- "llm_int8_threshold": 6.0,
30
- "load_in_4bit": true,
31
- "load_in_8bit": false,
32
- "quant_method": "bitsandbytes"
33
- },
34
  "rms_norm_eps": 1e-05,
35
  "rope_theta": 1000000.0,
36
  "sliding_window": null,
 
16
  "num_hidden_layers": 32,
17
  "num_key_value_heads": 8,
18
  "pad_token_id": 32001,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  "rms_norm_eps": 1e-05,
20
  "rope_theta": 1000000.0,
21
  "sliding_window": null,
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 5.923076923076923,
3
- "eval_loss": 1.2003010511398315,
4
- "eval_runtime": 30.7157,
5
  "eval_samples": 169,
6
- "eval_samples_per_second": 2.898,
7
- "eval_steps_per_second": 0.391
8
  }
 
1
  {
2
+ "epoch": 5.9171597633136095,
3
+ "eval_loss": 1.1831614971160889,
4
+ "eval_runtime": 5.5457,
5
  "eval_samples": 169,
6
+ "eval_samples_per_second": 16.049,
7
+ "eval_steps_per_second": 2.164
8
  }
special_tokens_map.json CHANGED
@@ -1,19 +1,23 @@
1
  {
2
- "bos_token": {
3
- "content": "<s>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "</s>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "pad_token": "</s>",
 
 
 
 
17
  "unk_token": {
18
  "content": "<unk>",
19
  "lstrip": false,
 
1
  {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<|im_start|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<|im_end|>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ }
17
+ ],
18
+ "bos_token": "<|im_start|>",
19
+ "eos_token": "<|im_end|>",
20
+ "pad_token": "<|im_end|>",
21
  "unk_token": {
22
  "content": "<unk>",
23
  "lstrip": false,
tokenizer.json CHANGED
@@ -32,21 +32,21 @@
32
  },
33
  {
34
  "id": 32000,
35
- "content": "<new_token1>",
36
  "single_word": false,
37
  "lstrip": false,
38
  "rstrip": false,
39
- "normalized": true,
40
- "special": false
41
  },
42
  {
43
  "id": 32001,
44
- "content": "<new_token2>",
45
  "single_word": false,
46
  "lstrip": false,
47
  "rstrip": false,
48
- "normalized": true,
49
- "special": false
50
  }
51
  ],
52
  "normalizer": {
 
32
  },
33
  {
34
  "id": 32000,
35
+ "content": "<|im_start|>",
36
  "single_word": false,
37
  "lstrip": false,
38
  "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
  },
42
  {
43
  "id": 32001,
44
+ "content": "<|im_end|>",
45
  "single_word": false,
46
  "lstrip": false,
47
  "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
  }
51
  ],
52
  "normalizer": {
tokenizer_config.json CHANGED
@@ -27,30 +27,33 @@
27
  "special": true
28
  },
29
  "32000": {
30
- "content": "<new_token1>",
31
  "lstrip": false,
32
- "normalized": true,
33
  "rstrip": false,
34
  "single_word": false,
35
- "special": false
36
  },
37
  "32001": {
38
- "content": "<new_token2>",
39
  "lstrip": false,
40
- "normalized": true,
41
  "rstrip": false,
42
  "single_word": false,
43
- "special": false
44
  }
45
  },
46
- "additional_special_tokens": [],
47
- "bos_token": "<s>",
48
- "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
 
 
 
49
  "clean_up_tokenization_spaces": false,
50
- "eos_token": "</s>",
51
  "legacy": true,
52
  "model_max_length": 1000000000000000019884624838656,
53
- "pad_token": "</s>",
54
  "sp_model_kwargs": {},
55
  "spaces_between_special_tokens": false,
56
  "tokenizer_class": "LlamaTokenizer",
 
27
  "special": true
28
  },
29
  "32000": {
30
+ "content": "<|im_start|>",
31
  "lstrip": false,
32
+ "normalized": false,
33
  "rstrip": false,
34
  "single_word": false,
35
+ "special": true
36
  },
37
  "32001": {
38
+ "content": "<|im_end|>",
39
  "lstrip": false,
40
+ "normalized": false,
41
  "rstrip": false,
42
  "single_word": false,
43
+ "special": true
44
  }
45
  },
46
+ "additional_special_tokens": [
47
+ "<|im_start|>",
48
+ "<|im_end|>"
49
+ ],
50
+ "bos_token": "<|im_start|>",
51
+ "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
52
  "clean_up_tokenization_spaces": false,
53
+ "eos_token": "<|im_end|>",
54
  "legacy": true,
55
  "model_max_length": 1000000000000000019884624838656,
56
+ "pad_token": "<|im_end|>",
57
  "sp_model_kwargs": {},
58
  "spaces_between_special_tokens": false,
59
  "tokenizer_class": "LlamaTokenizer",
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 5.923076923076923,
3
- "total_flos": 1.7606154086724403e+17,
4
- "train_loss": 4.533969319902815e-05,
5
- "train_runtime": 4.6313,
6
  "train_samples": 1346,
7
- "train_samples_per_second": 863.692,
8
- "train_steps_per_second": 215.923
9
  }
 
1
  {
2
+ "epoch": 5.9171597633136095,
3
+ "total_flos": 1.75885655212032e+17,
4
+ "train_loss": 0.2793775268793106,
5
+ "train_runtime": 1218.5958,
6
  "train_samples": 1346,
7
+ "train_samples_per_second": 3.282,
8
+ "train_steps_per_second": 0.821
9
  }
trainer_state.json CHANGED
@@ -1,621 +1,621 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 5.923076923076923,
5
  "eval_steps": 25,
6
- "global_step": 1001,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.14792899408284024,
13
- "grad_norm": 0.7053780555725098,
14
  "learning_rate": 0.0001951951951951952,
15
- "loss": 0.9474,
16
  "step": 25
17
  },
18
  {
19
  "epoch": 0.14792899408284024,
20
- "eval_loss": 0.6950487494468689,
21
- "eval_runtime": 31.2023,
22
- "eval_samples_per_second": 2.852,
23
- "eval_steps_per_second": 0.385,
24
  "step": 25
25
  },
26
  {
27
  "epoch": 0.2958579881656805,
28
- "grad_norm": 0.6385655403137207,
29
  "learning_rate": 0.0001901901901901902,
30
- "loss": 0.6403,
31
  "step": 50
32
  },
33
  {
34
  "epoch": 0.2958579881656805,
35
- "eval_loss": 0.6290514469146729,
36
- "eval_runtime": 31.6217,
37
- "eval_samples_per_second": 2.815,
38
- "eval_steps_per_second": 0.379,
39
  "step": 50
40
  },
41
  {
42
  "epoch": 0.4437869822485207,
43
- "grad_norm": 0.7015706896781921,
44
  "learning_rate": 0.0001851851851851852,
45
- "loss": 0.6184,
46
  "step": 75
47
  },
48
  {
49
  "epoch": 0.4437869822485207,
50
- "eval_loss": 0.6191244125366211,
51
- "eval_runtime": 31.7723,
52
- "eval_samples_per_second": 2.801,
53
- "eval_steps_per_second": 0.378,
54
  "step": 75
55
  },
56
  {
57
  "epoch": 0.591715976331361,
58
- "grad_norm": 0.6220183372497559,
59
  "learning_rate": 0.00018018018018018018,
60
- "loss": 0.6041,
61
  "step": 100
62
  },
63
  {
64
  "epoch": 0.591715976331361,
65
- "eval_loss": 0.608259379863739,
66
- "eval_runtime": 31.7817,
67
- "eval_samples_per_second": 2.8,
68
- "eval_steps_per_second": 0.378,
69
  "step": 100
70
  },
71
  {
72
  "epoch": 0.7396449704142012,
73
- "grad_norm": 0.6809254884719849,
74
  "learning_rate": 0.0001751751751751752,
75
- "loss": 0.6023,
76
  "step": 125
77
  },
78
  {
79
  "epoch": 0.7396449704142012,
80
- "eval_loss": 0.604732871055603,
81
- "eval_runtime": 31.6489,
82
- "eval_samples_per_second": 2.812,
83
- "eval_steps_per_second": 0.379,
84
  "step": 125
85
  },
86
  {
87
  "epoch": 0.8875739644970414,
88
- "grad_norm": 0.6159196496009827,
89
  "learning_rate": 0.0001701701701701702,
90
- "loss": 0.6086,
91
  "step": 150
92
  },
93
  {
94
  "epoch": 0.8875739644970414,
95
- "eval_loss": 0.6013623476028442,
96
- "eval_runtime": 31.6492,
97
- "eval_samples_per_second": 2.812,
98
- "eval_steps_per_second": 0.379,
99
  "step": 150
100
  },
101
  {
102
  "epoch": 1.0355029585798816,
103
- "grad_norm": 0.5559250116348267,
104
  "learning_rate": 0.00016516516516516518,
105
- "loss": 0.5545,
106
  "step": 175
107
  },
108
  {
109
  "epoch": 1.0355029585798816,
110
- "eval_loss": 0.6374889612197876,
111
- "eval_runtime": 31.6397,
112
- "eval_samples_per_second": 2.813,
113
- "eval_steps_per_second": 0.379,
114
  "step": 175
115
  },
116
  {
117
  "epoch": 1.183431952662722,
118
- "grad_norm": 0.5993044972419739,
119
  "learning_rate": 0.00016016016016016018,
120
- "loss": 0.4278,
121
  "step": 200
122
  },
123
  {
124
  "epoch": 1.183431952662722,
125
- "eval_loss": 0.6234655380249023,
126
- "eval_runtime": 31.6312,
127
- "eval_samples_per_second": 2.814,
128
- "eval_steps_per_second": 0.379,
129
  "step": 200
130
  },
131
  {
132
  "epoch": 1.331360946745562,
133
- "grad_norm": 0.689406156539917,
134
  "learning_rate": 0.00015515515515515516,
135
- "loss": 0.4297,
136
  "step": 225
137
  },
138
  {
139
  "epoch": 1.331360946745562,
140
- "eval_loss": 0.6275980472564697,
141
- "eval_runtime": 31.6369,
142
- "eval_samples_per_second": 2.813,
143
- "eval_steps_per_second": 0.379,
144
  "step": 225
145
  },
146
  {
147
  "epoch": 1.4792899408284024,
148
- "grad_norm": 0.7030369639396667,
149
  "learning_rate": 0.00015015015015015014,
150
- "loss": 0.4172,
151
  "step": 250
152
  },
153
  {
154
  "epoch": 1.4792899408284024,
155
- "eval_loss": 0.6314178705215454,
156
- "eval_runtime": 31.6517,
157
- "eval_samples_per_second": 2.812,
158
- "eval_steps_per_second": 0.379,
159
  "step": 250
160
  },
161
  {
162
  "epoch": 1.6272189349112427,
163
- "grad_norm": 0.7457050085067749,
164
  "learning_rate": 0.00014514514514514515,
165
- "loss": 0.4273,
166
  "step": 275
167
  },
168
  {
169
  "epoch": 1.6272189349112427,
170
- "eval_loss": 0.6374988555908203,
171
- "eval_runtime": 31.6395,
172
- "eval_samples_per_second": 2.813,
173
- "eval_steps_per_second": 0.379,
174
  "step": 275
175
  },
176
  {
177
  "epoch": 1.7751479289940828,
178
- "grad_norm": 0.6606324315071106,
179
  "learning_rate": 0.00014014014014014013,
180
- "loss": 0.4264,
181
  "step": 300
182
  },
183
  {
184
  "epoch": 1.7751479289940828,
185
- "eval_loss": 0.6344882845878601,
186
- "eval_runtime": 31.6619,
187
- "eval_samples_per_second": 2.811,
188
- "eval_steps_per_second": 0.379,
189
  "step": 300
190
  },
191
  {
192
  "epoch": 1.9230769230769231,
193
- "grad_norm": 0.675614058971405,
194
  "learning_rate": 0.00013513513513513514,
195
- "loss": 0.428,
196
  "step": 325
197
  },
198
  {
199
  "epoch": 1.9230769230769231,
200
- "eval_loss": 0.6296113133430481,
201
- "eval_runtime": 31.6627,
202
- "eval_samples_per_second": 2.811,
203
- "eval_steps_per_second": 0.379,
204
  "step": 325
205
  },
206
  {
207
  "epoch": 2.0710059171597632,
208
- "grad_norm": 0.7311059832572937,
209
  "learning_rate": 0.00013013013013013014,
210
- "loss": 0.3524,
211
  "step": 350
212
  },
213
  {
214
  "epoch": 2.0710059171597632,
215
- "eval_loss": 0.7067192792892456,
216
- "eval_runtime": 31.651,
217
- "eval_samples_per_second": 2.812,
218
- "eval_steps_per_second": 0.379,
219
  "step": 350
220
  },
221
  {
222
  "epoch": 2.2189349112426036,
223
- "grad_norm": 0.7563914060592651,
224
  "learning_rate": 0.00012512512512512512,
225
- "loss": 0.2697,
226
  "step": 375
227
  },
228
  {
229
  "epoch": 2.2189349112426036,
230
- "eval_loss": 0.7313967943191528,
231
- "eval_runtime": 31.645,
232
- "eval_samples_per_second": 2.812,
233
- "eval_steps_per_second": 0.379,
234
  "step": 375
235
  },
236
  {
237
  "epoch": 2.366863905325444,
238
- "grad_norm": 0.6278096437454224,
239
  "learning_rate": 0.00012012012012012013,
240
- "loss": 0.2645,
241
  "step": 400
242
  },
243
  {
244
  "epoch": 2.366863905325444,
245
- "eval_loss": 0.725497841835022,
246
- "eval_runtime": 31.6442,
247
- "eval_samples_per_second": 2.813,
248
- "eval_steps_per_second": 0.379,
249
  "step": 400
250
  },
251
  {
252
  "epoch": 2.5147928994082838,
253
- "grad_norm": 0.782738447189331,
254
  "learning_rate": 0.00011511511511511512,
255
- "loss": 0.2778,
256
  "step": 425
257
  },
258
  {
259
  "epoch": 2.5147928994082838,
260
- "eval_loss": 0.7220944166183472,
261
- "eval_runtime": 31.6506,
262
- "eval_samples_per_second": 2.812,
263
- "eval_steps_per_second": 0.379,
264
  "step": 425
265
  },
266
  {
267
  "epoch": 2.662721893491124,
268
- "grad_norm": 0.7897526025772095,
269
  "learning_rate": 0.00011011011011011012,
270
- "loss": 0.2687,
271
  "step": 450
272
  },
273
  {
274
  "epoch": 2.662721893491124,
275
- "eval_loss": 0.7373032569885254,
276
- "eval_runtime": 31.668,
277
- "eval_samples_per_second": 2.81,
278
- "eval_steps_per_second": 0.379,
279
  "step": 450
280
  },
281
  {
282
  "epoch": 2.8106508875739644,
283
- "grad_norm": 0.8417075276374817,
284
  "learning_rate": 0.00010510510510510511,
285
- "loss": 0.2748,
286
  "step": 475
287
  },
288
  {
289
  "epoch": 2.8106508875739644,
290
- "eval_loss": 0.7250338196754456,
291
- "eval_runtime": 31.7591,
292
- "eval_samples_per_second": 2.802,
293
- "eval_steps_per_second": 0.378,
294
  "step": 475
295
  },
296
  {
297
  "epoch": 2.9585798816568047,
298
- "grad_norm": 0.672287106513977,
299
  "learning_rate": 0.00010010010010010012,
300
- "loss": 0.2737,
301
  "step": 500
302
  },
303
  {
304
  "epoch": 2.9585798816568047,
305
- "eval_loss": 0.7213594913482666,
306
- "eval_runtime": 31.6416,
307
- "eval_samples_per_second": 2.813,
308
- "eval_steps_per_second": 0.379,
309
  "step": 500
310
  },
311
  {
312
  "epoch": 3.106508875739645,
313
- "grad_norm": 0.658898115158081,
314
  "learning_rate": 9.50950950950951e-05,
315
- "loss": 0.1848,
316
  "step": 525
317
  },
318
  {
319
  "epoch": 3.106508875739645,
320
- "eval_loss": 0.8414345979690552,
321
- "eval_runtime": 31.6513,
322
- "eval_samples_per_second": 2.812,
323
- "eval_steps_per_second": 0.379,
324
  "step": 525
325
  },
326
  {
327
  "epoch": 3.2544378698224854,
328
- "grad_norm": 0.6711578369140625,
329
  "learning_rate": 9.009009009009009e-05,
330
- "loss": 0.1535,
331
  "step": 550
332
  },
333
  {
334
  "epoch": 3.2544378698224854,
335
- "eval_loss": 0.8437659740447998,
336
- "eval_runtime": 31.6588,
337
- "eval_samples_per_second": 2.811,
338
- "eval_steps_per_second": 0.379,
339
  "step": 550
340
  },
341
  {
342
  "epoch": 3.4023668639053253,
343
- "grad_norm": 0.6709449887275696,
344
  "learning_rate": 8.50850850850851e-05,
345
- "loss": 0.1565,
346
  "step": 575
347
  },
348
  {
349
  "epoch": 3.4023668639053253,
350
- "eval_loss": 0.847898006439209,
351
- "eval_runtime": 31.6337,
352
- "eval_samples_per_second": 2.813,
353
- "eval_steps_per_second": 0.379,
354
  "step": 575
355
  },
356
  {
357
  "epoch": 3.5502958579881656,
358
- "grad_norm": 0.7851375937461853,
359
  "learning_rate": 8.008008008008009e-05,
360
- "loss": 0.1583,
361
  "step": 600
362
  },
363
  {
364
  "epoch": 3.5502958579881656,
365
- "eval_loss": 0.8719269633293152,
366
- "eval_runtime": 31.6367,
367
- "eval_samples_per_second": 2.813,
368
- "eval_steps_per_second": 0.379,
369
  "step": 600
370
  },
371
  {
372
  "epoch": 3.698224852071006,
373
- "grad_norm": 0.7410476207733154,
374
  "learning_rate": 7.507507507507507e-05,
375
- "loss": 0.1537,
376
  "step": 625
377
  },
378
  {
379
  "epoch": 3.698224852071006,
380
- "eval_loss": 0.8496631383895874,
381
- "eval_runtime": 31.6614,
382
- "eval_samples_per_second": 2.811,
383
- "eval_steps_per_second": 0.379,
384
  "step": 625
385
  },
386
  {
387
  "epoch": 3.8461538461538463,
388
- "grad_norm": 0.7157964110374451,
389
  "learning_rate": 7.007007007007007e-05,
390
- "loss": 0.1611,
391
  "step": 650
392
  },
393
  {
394
  "epoch": 3.8461538461538463,
395
- "eval_loss": 0.8586809039115906,
396
- "eval_runtime": 31.6397,
397
- "eval_samples_per_second": 2.813,
398
- "eval_steps_per_second": 0.379,
399
  "step": 650
400
  },
401
  {
402
  "epoch": 3.994082840236686,
403
- "grad_norm": 0.7757616639137268,
404
  "learning_rate": 6.506506506506507e-05,
405
- "loss": 0.16,
406
  "step": 675
407
  },
408
  {
409
  "epoch": 3.994082840236686,
410
- "eval_loss": 0.8713619112968445,
411
- "eval_runtime": 31.6525,
412
- "eval_samples_per_second": 2.812,
413
- "eval_steps_per_second": 0.379,
414
  "step": 675
415
  },
416
  {
417
  "epoch": 4.1420118343195265,
418
- "grad_norm": 0.5490134358406067,
419
  "learning_rate": 6.0060060060060066e-05,
420
- "loss": 0.0913,
421
  "step": 700
422
  },
423
  {
424
  "epoch": 4.1420118343195265,
425
- "eval_loss": 1.0193753242492676,
426
- "eval_runtime": 31.652,
427
- "eval_samples_per_second": 2.812,
428
- "eval_steps_per_second": 0.379,
429
  "step": 700
430
  },
431
  {
432
  "epoch": 4.289940828402367,
433
- "grad_norm": 0.667753279209137,
434
  "learning_rate": 5.505505505505506e-05,
435
- "loss": 0.0841,
436
  "step": 725
437
  },
438
  {
439
  "epoch": 4.289940828402367,
440
- "eval_loss": 1.0428720712661743,
441
- "eval_runtime": 31.6444,
442
- "eval_samples_per_second": 2.813,
443
- "eval_steps_per_second": 0.379,
444
  "step": 725
445
  },
446
  {
447
  "epoch": 4.437869822485207,
448
- "grad_norm": 0.5531997084617615,
449
  "learning_rate": 5.005005005005006e-05,
450
- "loss": 0.0836,
451
  "step": 750
452
  },
453
  {
454
  "epoch": 4.437869822485207,
455
- "eval_loss": 1.0543982982635498,
456
- "eval_runtime": 31.6468,
457
- "eval_samples_per_second": 2.812,
458
- "eval_steps_per_second": 0.379,
459
  "step": 750
460
  },
461
  {
462
  "epoch": 4.585798816568047,
463
- "grad_norm": 0.6152017712593079,
464
  "learning_rate": 4.5045045045045046e-05,
465
- "loss": 0.0842,
466
  "step": 775
467
  },
468
  {
469
  "epoch": 4.585798816568047,
470
- "eval_loss": 1.0431654453277588,
471
- "eval_runtime": 31.6364,
472
- "eval_samples_per_second": 2.813,
473
- "eval_steps_per_second": 0.379,
474
  "step": 775
475
  },
476
  {
477
  "epoch": 4.733727810650888,
478
- "grad_norm": 0.5139034986495972,
479
  "learning_rate": 4.0040040040040046e-05,
480
- "loss": 0.081,
481
  "step": 800
482
  },
483
  {
484
  "epoch": 4.733727810650888,
485
- "eval_loss": 1.0386168956756592,
486
- "eval_runtime": 31.7013,
487
- "eval_samples_per_second": 2.807,
488
- "eval_steps_per_second": 0.379,
489
  "step": 800
490
  },
491
  {
492
  "epoch": 4.881656804733728,
493
- "grad_norm": 0.7744113206863403,
494
  "learning_rate": 3.503503503503503e-05,
495
- "loss": 0.0848,
496
  "step": 825
497
  },
498
  {
499
  "epoch": 4.881656804733728,
500
- "eval_loss": 1.0704792737960815,
501
- "eval_runtime": 31.6705,
502
- "eval_samples_per_second": 2.81,
503
- "eval_steps_per_second": 0.379,
504
  "step": 825
505
  },
506
  {
507
  "epoch": 5.029585798816568,
508
- "grad_norm": 0.4454633593559265,
509
  "learning_rate": 3.0030030030030033e-05,
510
- "loss": 0.0776,
511
  "step": 850
512
  },
513
  {
514
  "epoch": 5.029585798816568,
515
- "eval_loss": 1.0772627592086792,
516
- "eval_runtime": 31.7163,
517
- "eval_samples_per_second": 2.806,
518
- "eval_steps_per_second": 0.378,
519
  "step": 850
520
  },
521
  {
522
  "epoch": 5.177514792899408,
523
- "grad_norm": 0.45951634645462036,
524
  "learning_rate": 2.502502502502503e-05,
525
- "loss": 0.0485,
526
  "step": 875
527
  },
528
  {
529
  "epoch": 5.177514792899408,
530
- "eval_loss": 1.190962553024292,
531
- "eval_runtime": 31.6581,
532
- "eval_samples_per_second": 2.811,
533
- "eval_steps_per_second": 0.379,
534
  "step": 875
535
  },
536
  {
537
  "epoch": 5.325443786982248,
538
- "grad_norm": 0.49803122878074646,
539
  "learning_rate": 2.0020020020020023e-05,
540
- "loss": 0.0481,
541
  "step": 900
542
  },
543
  {
544
  "epoch": 5.325443786982248,
545
- "eval_loss": 1.1799925565719604,
546
- "eval_runtime": 31.6457,
547
- "eval_samples_per_second": 2.812,
548
- "eval_steps_per_second": 0.379,
549
  "step": 900
550
  },
551
  {
552
  "epoch": 5.4733727810650885,
553
- "grad_norm": 0.48808640241622925,
554
  "learning_rate": 1.5015015015015016e-05,
555
- "loss": 0.0481,
556
  "step": 925
557
  },
558
  {
559
  "epoch": 5.4733727810650885,
560
- "eval_loss": 1.1915431022644043,
561
- "eval_runtime": 31.641,
562
- "eval_samples_per_second": 2.813,
563
- "eval_steps_per_second": 0.379,
564
  "step": 925
565
  },
566
  {
567
  "epoch": 5.621301775147929,
568
- "grad_norm": 0.4711610972881317,
569
  "learning_rate": 1.0010010010010011e-05,
570
- "loss": 0.0467,
571
  "step": 950
572
  },
573
  {
574
  "epoch": 5.621301775147929,
575
- "eval_loss": 1.1996334791183472,
576
- "eval_runtime": 31.634,
577
- "eval_samples_per_second": 2.813,
578
- "eval_steps_per_second": 0.379,
579
  "step": 950
580
  },
581
  {
582
  "epoch": 5.769230769230769,
583
- "grad_norm": 0.4745465815067291,
584
  "learning_rate": 5.005005005005006e-06,
585
- "loss": 0.0468,
586
  "step": 975
587
  },
588
  {
589
  "epoch": 5.769230769230769,
590
- "eval_loss": 1.1960943937301636,
591
- "eval_runtime": 31.6459,
592
- "eval_samples_per_second": 2.812,
593
- "eval_steps_per_second": 0.379,
594
  "step": 975
595
  },
596
  {
597
  "epoch": 5.9171597633136095,
598
- "grad_norm": 0.44335442781448364,
599
  "learning_rate": 0.0,
600
- "loss": 0.0449,
601
  "step": 1000
602
  },
603
  {
604
  "epoch": 5.9171597633136095,
605
- "eval_loss": 1.2003010511398315,
606
- "eval_runtime": 31.6446,
607
- "eval_samples_per_second": 2.812,
608
- "eval_steps_per_second": 0.379,
609
  "step": 1000
610
  },
611
  {
612
- "epoch": 5.923076923076923,
613
- "step": 1001,
614
- "total_flos": 1.7606154086724403e+17,
615
- "train_loss": 4.533969319902815e-05,
616
- "train_runtime": 4.6313,
617
- "train_samples_per_second": 863.692,
618
- "train_steps_per_second": 215.923
619
  }
620
  ],
621
  "logging_steps": 25,
@@ -623,7 +623,7 @@
623
  "num_input_tokens_seen": 0,
624
  "num_train_epochs": 6,
625
  "save_steps": 25,
626
- "total_flos": 1.7606154086724403e+17,
627
  "train_batch_size": 1,
628
  "trial_name": null,
629
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 5.9171597633136095,
5
  "eval_steps": 25,
6
+ "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.14792899408284024,
13
+ "grad_norm": 0.98828125,
14
  "learning_rate": 0.0001951951951951952,
15
+ "loss": 0.9391,
16
  "step": 25
17
  },
18
  {
19
  "epoch": 0.14792899408284024,
20
+ "eval_loss": 0.6652668118476868,
21
+ "eval_runtime": 5.3863,
22
+ "eval_samples_per_second": 16.523,
23
+ "eval_steps_per_second": 2.228,
24
  "step": 25
25
  },
26
  {
27
  "epoch": 0.2958579881656805,
28
+ "grad_norm": 0.654296875,
29
  "learning_rate": 0.0001901901901901902,
30
+ "loss": 0.6138,
31
  "step": 50
32
  },
33
  {
34
  "epoch": 0.2958579881656805,
35
+ "eval_loss": 0.6126009225845337,
36
+ "eval_runtime": 5.4512,
37
+ "eval_samples_per_second": 16.327,
38
+ "eval_steps_per_second": 2.201,
39
  "step": 50
40
  },
41
  {
42
  "epoch": 0.4437869822485207,
43
+ "grad_norm": 0.7314453125,
44
  "learning_rate": 0.0001851851851851852,
45
+ "loss": 0.6039,
46
  "step": 75
47
  },
48
  {
49
  "epoch": 0.4437869822485207,
50
+ "eval_loss": 0.6061152219772339,
51
+ "eval_runtime": 5.4459,
52
+ "eval_samples_per_second": 16.343,
53
+ "eval_steps_per_second": 2.203,
54
  "step": 75
55
  },
56
  {
57
  "epoch": 0.591715976331361,
58
+ "grad_norm": 0.65869140625,
59
  "learning_rate": 0.00018018018018018018,
60
+ "loss": 0.5927,
61
  "step": 100
62
  },
63
  {
64
  "epoch": 0.591715976331361,
65
+ "eval_loss": 0.5998491644859314,
66
+ "eval_runtime": 5.4564,
67
+ "eval_samples_per_second": 16.311,
68
+ "eval_steps_per_second": 2.199,
69
  "step": 100
70
  },
71
  {
72
  "epoch": 0.7396449704142012,
73
+ "grad_norm": 0.67333984375,
74
  "learning_rate": 0.0001751751751751752,
75
+ "loss": 0.5973,
76
  "step": 125
77
  },
78
  {
79
  "epoch": 0.7396449704142012,
80
+ "eval_loss": 0.594585120677948,
81
+ "eval_runtime": 5.4901,
82
+ "eval_samples_per_second": 16.211,
83
+ "eval_steps_per_second": 2.186,
84
  "step": 125
85
  },
86
  {
87
  "epoch": 0.8875739644970414,
88
+ "grad_norm": 0.65380859375,
89
  "learning_rate": 0.0001701701701701702,
90
+ "loss": 0.602,
91
  "step": 150
92
  },
93
  {
94
  "epoch": 0.8875739644970414,
95
+ "eval_loss": 0.5942851305007935,
96
+ "eval_runtime": 5.4552,
97
+ "eval_samples_per_second": 16.315,
98
+ "eval_steps_per_second": 2.2,
99
  "step": 150
100
  },
101
  {
102
  "epoch": 1.0355029585798816,
103
+ "grad_norm": 0.58544921875,
104
  "learning_rate": 0.00016516516516516518,
105
+ "loss": 0.547,
106
  "step": 175
107
  },
108
  {
109
  "epoch": 1.0355029585798816,
110
+ "eval_loss": 0.6319454312324524,
111
+ "eval_runtime": 5.4449,
112
+ "eval_samples_per_second": 16.345,
113
+ "eval_steps_per_second": 2.204,
114
  "step": 175
115
  },
116
  {
117
  "epoch": 1.183431952662722,
118
+ "grad_norm": 0.62158203125,
119
  "learning_rate": 0.00016016016016016018,
120
+ "loss": 0.4239,
121
  "step": 200
122
  },
123
  {
124
  "epoch": 1.183431952662722,
125
+ "eval_loss": 0.6168724894523621,
126
+ "eval_runtime": 5.4499,
127
+ "eval_samples_per_second": 16.331,
128
+ "eval_steps_per_second": 2.202,
129
  "step": 200
130
  },
131
  {
132
  "epoch": 1.331360946745562,
133
+ "grad_norm": 0.71240234375,
134
  "learning_rate": 0.00015515515515515516,
135
+ "loss": 0.4301,
136
  "step": 225
137
  },
138
  {
139
  "epoch": 1.331360946745562,
140
+ "eval_loss": 0.615761935710907,
141
+ "eval_runtime": 5.4932,
142
+ "eval_samples_per_second": 16.202,
143
+ "eval_steps_per_second": 2.185,
144
  "step": 225
145
  },
146
  {
147
  "epoch": 1.4792899408284024,
148
+ "grad_norm": 0.6865234375,
149
  "learning_rate": 0.00015015015015015014,
150
+ "loss": 0.4176,
151
  "step": 250
152
  },
153
  {
154
  "epoch": 1.4792899408284024,
155
+ "eval_loss": 0.6192708611488342,
156
+ "eval_runtime": 5.4548,
157
+ "eval_samples_per_second": 16.316,
158
+ "eval_steps_per_second": 2.2,
159
  "step": 250
160
  },
161
  {
162
  "epoch": 1.6272189349112427,
163
+ "grad_norm": 0.8076171875,
164
  "learning_rate": 0.00014514514514514515,
165
+ "loss": 0.4295,
166
  "step": 275
167
  },
168
  {
169
  "epoch": 1.6272189349112427,
170
+ "eval_loss": 0.6242427229881287,
171
+ "eval_runtime": 5.4583,
172
+ "eval_samples_per_second": 16.305,
173
+ "eval_steps_per_second": 2.198,
174
  "step": 275
175
  },
176
  {
177
  "epoch": 1.7751479289940828,
178
+ "grad_norm": 0.66796875,
179
  "learning_rate": 0.00014014014014014013,
180
+ "loss": 0.4252,
181
  "step": 300
182
  },
183
  {
184
  "epoch": 1.7751479289940828,
185
+ "eval_loss": 0.6264795660972595,
186
+ "eval_runtime": 5.4513,
187
+ "eval_samples_per_second": 16.326,
188
+ "eval_steps_per_second": 2.201,
189
  "step": 300
190
  },
191
  {
192
  "epoch": 1.9230769230769231,
193
+ "grad_norm": 0.720703125,
194
  "learning_rate": 0.00013513513513513514,
195
+ "loss": 0.4252,
196
  "step": 325
197
  },
198
  {
199
  "epoch": 1.9230769230769231,
200
+ "eval_loss": 0.6264156103134155,
201
+ "eval_runtime": 5.4759,
202
+ "eval_samples_per_second": 16.253,
203
+ "eval_steps_per_second": 2.191,
204
  "step": 325
205
  },
206
  {
207
  "epoch": 2.0710059171597632,
208
+ "grad_norm": 0.76611328125,
209
  "learning_rate": 0.00013013013013013014,
210
+ "loss": 0.3591,
211
  "step": 350
212
  },
213
  {
214
  "epoch": 2.0710059171597632,
215
+ "eval_loss": 0.6893021464347839,
216
+ "eval_runtime": 5.4744,
217
+ "eval_samples_per_second": 16.258,
218
+ "eval_steps_per_second": 2.192,
219
  "step": 350
220
  },
221
  {
222
  "epoch": 2.2189349112426036,
223
+ "grad_norm": 0.74169921875,
224
  "learning_rate": 0.00012512512512512512,
225
+ "loss": 0.2758,
226
  "step": 375
227
  },
228
  {
229
  "epoch": 2.2189349112426036,
230
+ "eval_loss": 0.7153319716453552,
231
+ "eval_runtime": 5.504,
232
+ "eval_samples_per_second": 16.17,
233
+ "eval_steps_per_second": 2.18,
234
  "step": 375
235
  },
236
  {
237
  "epoch": 2.366863905325444,
238
+ "grad_norm": 0.69384765625,
239
  "learning_rate": 0.00012012012012012013,
240
+ "loss": 0.2702,
241
  "step": 400
242
  },
243
  {
244
  "epoch": 2.366863905325444,
245
+ "eval_loss": 0.7170297503471375,
246
+ "eval_runtime": 5.4565,
247
+ "eval_samples_per_second": 16.311,
248
+ "eval_steps_per_second": 2.199,
249
  "step": 400
250
  },
251
  {
252
  "epoch": 2.5147928994082838,
253
+ "grad_norm": 0.806640625,
254
  "learning_rate": 0.00011511511511511512,
255
+ "loss": 0.2797,
256
  "step": 425
257
  },
258
  {
259
  "epoch": 2.5147928994082838,
260
+ "eval_loss": 0.7173412442207336,
261
+ "eval_runtime": 5.4741,
262
+ "eval_samples_per_second": 16.258,
263
+ "eval_steps_per_second": 2.192,
264
  "step": 425
265
  },
266
  {
267
  "epoch": 2.662721893491124,
268
+ "grad_norm": 0.77099609375,
269
  "learning_rate": 0.00011011011011011012,
270
+ "loss": 0.2727,
271
  "step": 450
272
  },
273
  {
274
  "epoch": 2.662721893491124,
275
+ "eval_loss": 0.7144489288330078,
276
+ "eval_runtime": 5.5009,
277
+ "eval_samples_per_second": 16.179,
278
+ "eval_steps_per_second": 2.181,
279
  "step": 450
280
  },
281
  {
282
  "epoch": 2.8106508875739644,
283
+ "grad_norm": 42.5625,
284
  "learning_rate": 0.00010510510510510511,
285
+ "loss": 0.2817,
286
  "step": 475
287
  },
288
  {
289
  "epoch": 2.8106508875739644,
290
+ "eval_loss": 0.7168906331062317,
291
+ "eval_runtime": 5.4533,
292
+ "eval_samples_per_second": 16.32,
293
+ "eval_steps_per_second": 2.201,
294
  "step": 475
295
  },
296
  {
297
  "epoch": 2.9585798816568047,
298
+ "grad_norm": 0.724609375,
299
  "learning_rate": 0.00010010010010010012,
300
+ "loss": 0.2798,
301
  "step": 500
302
  },
303
  {
304
  "epoch": 2.9585798816568047,
305
+ "eval_loss": 0.7015586495399475,
306
+ "eval_runtime": 5.467,
307
+ "eval_samples_per_second": 16.28,
308
+ "eval_steps_per_second": 2.195,
309
  "step": 500
310
  },
311
  {
312
  "epoch": 3.106508875739645,
313
+ "grad_norm": 0.6162109375,
314
  "learning_rate": 9.50950950950951e-05,
315
+ "loss": 0.1922,
316
  "step": 525
317
  },
318
  {
319
  "epoch": 3.106508875739645,
320
+ "eval_loss": 0.8090196847915649,
321
+ "eval_runtime": 5.458,
322
+ "eval_samples_per_second": 16.306,
323
+ "eval_steps_per_second": 2.199,
324
  "step": 525
325
  },
326
  {
327
  "epoch": 3.2544378698224854,
328
+ "grad_norm": 0.80517578125,
329
  "learning_rate": 9.009009009009009e-05,
330
+ "loss": 0.16,
331
  "step": 550
332
  },
333
  {
334
  "epoch": 3.2544378698224854,
335
+ "eval_loss": 0.8372513651847839,
336
+ "eval_runtime": 5.4975,
337
+ "eval_samples_per_second": 16.189,
338
+ "eval_steps_per_second": 2.183,
339
  "step": 550
340
  },
341
  {
342
  "epoch": 3.4023668639053253,
343
+ "grad_norm": 0.71728515625,
344
  "learning_rate": 8.50850850850851e-05,
345
+ "loss": 0.1623,
346
  "step": 575
347
  },
348
  {
349
  "epoch": 3.4023668639053253,
350
+ "eval_loss": 0.8371546864509583,
351
+ "eval_runtime": 5.4897,
352
+ "eval_samples_per_second": 16.212,
353
+ "eval_steps_per_second": 2.186,
354
  "step": 575
355
  },
356
  {
357
  "epoch": 3.5502958579881656,
358
+ "grad_norm": 0.775390625,
359
  "learning_rate": 8.008008008008009e-05,
360
+ "loss": 0.1632,
361
  "step": 600
362
  },
363
  {
364
  "epoch": 3.5502958579881656,
365
+ "eval_loss": 0.8401942849159241,
366
+ "eval_runtime": 5.4525,
367
+ "eval_samples_per_second": 16.323,
368
+ "eval_steps_per_second": 2.201,
369
  "step": 600
370
  },
371
  {
372
  "epoch": 3.698224852071006,
373
+ "grad_norm": 0.96337890625,
374
  "learning_rate": 7.507507507507507e-05,
375
+ "loss": 0.1618,
376
  "step": 625
377
  },
378
  {
379
  "epoch": 3.698224852071006,
380
+ "eval_loss": 0.8558365106582642,
381
+ "eval_runtime": 5.4558,
382
+ "eval_samples_per_second": 16.313,
383
+ "eval_steps_per_second": 2.199,
384
  "step": 625
385
  },
386
  {
387
  "epoch": 3.8461538461538463,
388
+ "grad_norm": 0.80322265625,
389
  "learning_rate": 7.007007007007007e-05,
390
+ "loss": 0.1732,
391
  "step": 650
392
  },
393
  {
394
  "epoch": 3.8461538461538463,
395
+ "eval_loss": 0.8581485748291016,
396
+ "eval_runtime": 5.4935,
397
+ "eval_samples_per_second": 16.201,
398
+ "eval_steps_per_second": 2.184,
399
  "step": 650
400
  },
401
  {
402
  "epoch": 3.994082840236686,
403
+ "grad_norm": 0.85498046875,
404
  "learning_rate": 6.506506506506507e-05,
405
+ "loss": 0.1687,
406
  "step": 675
407
  },
408
  {
409
  "epoch": 3.994082840236686,
410
+ "eval_loss": 0.8611082434654236,
411
+ "eval_runtime": 5.4485,
412
+ "eval_samples_per_second": 16.335,
413
+ "eval_steps_per_second": 2.202,
414
  "step": 675
415
  },
416
  {
417
  "epoch": 4.1420118343195265,
418
+ "grad_norm": 0.5654296875,
419
  "learning_rate": 6.0060060060060066e-05,
420
+ "loss": 0.0961,
421
  "step": 700
422
  },
423
  {
424
  "epoch": 4.1420118343195265,
425
+ "eval_loss": 0.9902079105377197,
426
+ "eval_runtime": 5.519,
427
+ "eval_samples_per_second": 16.126,
428
+ "eval_steps_per_second": 2.174,
429
  "step": 700
430
  },
431
  {
432
  "epoch": 4.289940828402367,
433
+ "grad_norm": 0.560546875,
434
  "learning_rate": 5.505505505505506e-05,
435
+ "loss": 0.0879,
436
  "step": 725
437
  },
438
  {
439
  "epoch": 4.289940828402367,
440
+ "eval_loss": 1.0101935863494873,
441
+ "eval_runtime": 5.4771,
442
+ "eval_samples_per_second": 16.25,
443
+ "eval_steps_per_second": 2.191,
444
  "step": 725
445
  },
446
  {
447
  "epoch": 4.437869822485207,
448
+ "grad_norm": 0.76611328125,
449
  "learning_rate": 5.005005005005006e-05,
450
+ "loss": 0.0899,
451
  "step": 750
452
  },
453
  {
454
  "epoch": 4.437869822485207,
455
+ "eval_loss": 1.0344929695129395,
456
+ "eval_runtime": 5.4997,
457
+ "eval_samples_per_second": 16.183,
458
+ "eval_steps_per_second": 2.182,
459
  "step": 750
460
  },
461
  {
462
  "epoch": 4.585798816568047,
463
+ "grad_norm": 0.595703125,
464
  "learning_rate": 4.5045045045045046e-05,
465
+ "loss": 0.0899,
466
  "step": 775
467
  },
468
  {
469
  "epoch": 4.585798816568047,
470
+ "eval_loss": 1.0255744457244873,
471
+ "eval_runtime": 5.4646,
472
+ "eval_samples_per_second": 16.287,
473
+ "eval_steps_per_second": 2.196,
474
  "step": 775
475
  },
476
  {
477
  "epoch": 4.733727810650888,
478
+ "grad_norm": 0.5869140625,
479
  "learning_rate": 4.0040040040040046e-05,
480
+ "loss": 0.0882,
481
  "step": 800
482
  },
483
  {
484
  "epoch": 4.733727810650888,
485
+ "eval_loss": 1.0273164510726929,
486
+ "eval_runtime": 5.4989,
487
+ "eval_samples_per_second": 16.185,
488
+ "eval_steps_per_second": 2.182,
489
  "step": 800
490
  },
491
  {
492
  "epoch": 4.881656804733728,
493
+ "grad_norm": 0.720703125,
494
  "learning_rate": 3.503503503503503e-05,
495
+ "loss": 0.0893,
496
  "step": 825
497
  },
498
  {
499
  "epoch": 4.881656804733728,
500
+ "eval_loss": 1.0559364557266235,
501
+ "eval_runtime": 5.4574,
502
+ "eval_samples_per_second": 16.308,
503
+ "eval_steps_per_second": 2.199,
504
  "step": 825
505
  },
506
  {
507
  "epoch": 5.029585798816568,
508
+ "grad_norm": 0.4755859375,
509
  "learning_rate": 3.0030030030030033e-05,
510
+ "loss": 0.0824,
511
  "step": 850
512
  },
513
  {
514
  "epoch": 5.029585798816568,
515
+ "eval_loss": 1.0753172636032104,
516
+ "eval_runtime": 5.5098,
517
+ "eval_samples_per_second": 16.153,
518
+ "eval_steps_per_second": 2.178,
519
  "step": 850
520
  },
521
  {
522
  "epoch": 5.177514792899408,
523
+ "grad_norm": 0.50439453125,
524
  "learning_rate": 2.502502502502503e-05,
525
+ "loss": 0.052,
526
  "step": 875
527
  },
528
  {
529
  "epoch": 5.177514792899408,
530
+ "eval_loss": 1.158236026763916,
531
+ "eval_runtime": 5.4641,
532
+ "eval_samples_per_second": 16.288,
533
+ "eval_steps_per_second": 2.196,
534
  "step": 875
535
  },
536
  {
537
  "epoch": 5.325443786982248,
538
+ "grad_norm": 0.468994140625,
539
  "learning_rate": 2.0020020020020023e-05,
540
+ "loss": 0.052,
541
  "step": 900
542
  },
543
  {
544
  "epoch": 5.325443786982248,
545
+ "eval_loss": 1.164330005645752,
546
+ "eval_runtime": 5.4588,
547
+ "eval_samples_per_second": 16.304,
548
+ "eval_steps_per_second": 2.198,
549
  "step": 900
550
  },
551
  {
552
  "epoch": 5.4733727810650885,
553
+ "grad_norm": 0.5849609375,
554
  "learning_rate": 1.5015015015015016e-05,
555
+ "loss": 0.0526,
556
  "step": 925
557
  },
558
  {
559
  "epoch": 5.4733727810650885,
560
+ "eval_loss": 1.1923322677612305,
561
+ "eval_runtime": 5.5009,
562
+ "eval_samples_per_second": 16.179,
563
+ "eval_steps_per_second": 2.181,
564
  "step": 925
565
  },
566
  {
567
  "epoch": 5.621301775147929,
568
+ "grad_norm": 0.52783203125,
569
  "learning_rate": 1.0010010010010011e-05,
570
+ "loss": 0.0497,
571
  "step": 950
572
  },
573
  {
574
  "epoch": 5.621301775147929,
575
+ "eval_loss": 1.175872802734375,
576
+ "eval_runtime": 5.4976,
577
+ "eval_samples_per_second": 16.189,
578
+ "eval_steps_per_second": 2.183,
579
  "step": 950
580
  },
581
  {
582
  "epoch": 5.769230769230769,
583
+ "grad_norm": 0.461669921875,
584
  "learning_rate": 5.005005005005006e-06,
585
+ "loss": 0.0496,
586
  "step": 975
587
  },
588
  {
589
  "epoch": 5.769230769230769,
590
+ "eval_loss": 1.1811896562576294,
591
+ "eval_runtime": 5.4611,
592
+ "eval_samples_per_second": 16.297,
593
+ "eval_steps_per_second": 2.197,
594
  "step": 975
595
  },
596
  {
597
  "epoch": 5.9171597633136095,
598
+ "grad_norm": 0.487548828125,
599
  "learning_rate": 0.0,
600
+ "loss": 0.0477,
601
  "step": 1000
602
  },
603
  {
604
  "epoch": 5.9171597633136095,
605
+ "eval_loss": 1.1831614971160889,
606
+ "eval_runtime": 5.452,
607
+ "eval_samples_per_second": 16.324,
608
+ "eval_steps_per_second": 2.201,
609
  "step": 1000
610
  },
611
  {
612
+ "epoch": 5.9171597633136095,
613
+ "step": 1000,
614
+ "total_flos": 1.75885655212032e+17,
615
+ "train_loss": 0.2793775268793106,
616
+ "train_runtime": 1218.5958,
617
+ "train_samples_per_second": 3.282,
618
+ "train_steps_per_second": 0.821
619
  }
620
  ],
621
  "logging_steps": 25,
 
623
  "num_input_tokens_seen": 0,
624
  "num_train_epochs": 6,
625
  "save_steps": 25,
626
+ "total_flos": 1.75885655212032e+17,
627
  "train_batch_size": 1,
628
  "trial_name": null,
629
  "trial_params": null