SriSanth2345 committed
Commit 27083c4
1 Parent(s): dcdfdd3

Upload 16 files

README.md ADDED
@@ -0,0 +1,59 @@
+ ---
+ base_model: mistralai/Mistral-7B-Instruct-v0.1
+ library_name: peft
+ license: other
+ tags:
+ - llama-factory
+ - lora
+ - generated_from_trainer
+ model-index:
+ - name: mistral_physs
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # mistral_physs
+
+ This model is a LoRA fine-tune of [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) on the physics dataset (HydraLM/physics_dataset_alpaca).
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 3e-05
+ - train_batch_size: 8
+ - eval_batch_size: 8
+ - seed: 42
+ - gradient_accumulation_steps: 8
+ - total_train_batch_size: 64
+ - optimizer: AdamW (adamw_torch) with betas=(0.9, 0.999) and epsilon=1e-08
+ - lr_scheduler_type: cosine
+ - num_epochs: 1.0
+ - mixed_precision_training: Native AMP
+
+ ### Training results
+
+
+
+ ### Framework versions
+
+ - PEFT 0.12.0
+ - Transformers 4.43.4
+ - PyTorch 2.2.1
+ - Datasets 2.21.0
+ - Tokenizers 0.19.1
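
Since this repository only contains a PEFT LoRA adapter (see adapter_config.json and adapter_model.safetensors below), it must be loaded on top of the base model. A minimal loading sketch, assuming the adapter repo id is `SriSanth2345/mistral_physs` (a hypothetical id; substitute the actual repo id or local path):

```python
# Minimal sketch: attach this LoRA adapter to the base model with PEFT.
# Assumption: the adapter lives at "SriSanth2345/mistral_physs"; adjust as needed.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "mistralai/Mistral-7B-Instruct-v0.1"
adapter_id = "SriSanth2345/mistral_physs"  # hypothetical repo id for this adapter

tokenizer = AutoTokenizer.from_pretrained(base_id)
base = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype=torch.float16, device_map="auto")
model = PeftModel.from_pretrained(base, adapter_id)  # loads adapter_model.safetensors

prompt = "[INST] State Newton's second law. [/INST]"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```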
adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.1",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "down_proj",
+ "up_proj",
+ "k_proj",
+ "o_proj",
+ "gate_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+ }
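
For reference, the key fields in adapter_config.json above correspond to the following PEFT `LoraConfig` (a sketch; PEFT 0.12 assumed):

```python
# Sketch: a LoraConfig matching the key fields of adapter_config.json above.
from peft import LoraConfig

lora_config = LoraConfig(
    r=8,                 # LoRA rank
    lora_alpha=16,       # effective scaling = lora_alpha / r = 2.0
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],  # all linear projections
)
# Typical use: model = get_peft_model(base_model, lora_config)
```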
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:38971402ace8dfaa4c171680108f01323ecd1d7ef217e9b4e01921d67dac59ac
+ size 83945296
all_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "epoch": 0.9984,
+ "num_input_tokens_seen": 5111808,
+ "total_flos": 2.1873318928633037e+17,
+ "train_loss": 0.5760276004289969,
+ "train_runtime": 1768.2955,
+ "train_samples_per_second": 5.655,
+ "train_steps_per_second": 0.088
+ }
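
The aggregates above are mutually consistent; a quick sanity check with the numbers copied from the JSON:

```python
# Sanity check of the training aggregates reported above.
train_runtime = 1768.2955        # seconds
samples_per_second = 5.655
steps_per_second = 0.088
num_input_tokens_seen = 5_111_808

print(train_runtime * samples_per_second)  # ~10,000 samples (max_samples in training_args.yaml)
print(train_runtime * steps_per_second)    # ~156 optimizer steps (see running_log.txt)
print(num_input_tokens_seen / (156 * 64))  # 512.0 tokens per sequence = cutoff_len
```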
llamaboard_config.yaml ADDED
@@ -0,0 +1,67 @@
+ top.booster: auto
+ top.checkpoint_path: []
+ top.finetuning_type: lora
+ top.model_name: Mistral-7B-v0.1-Chat
+ top.quantization_bit: none
+ top.quantization_method: bitsandbytes
+ top.rope_scaling: none
+ top.template: mistral
+ top.visual_inputs: false
+ train.additional_target: ''
+ train.badam_mode: layer
+ train.badam_switch_interval: 50
+ train.badam_switch_mode: ascending
+ train.badam_update_ratio: 0.05
+ train.batch_size: 8
+ train.compute_type: fp16
+ train.create_new_adapter: false
+ train.cutoff_len: 512
+ train.dataset:
+ - physics
+ train.dataset_dir: data
+ train.ds_offload: false
+ train.ds_stage: none
+ train.freeze_extra_modules: ''
+ train.freeze_trainable_layers: 2
+ train.freeze_trainable_modules: all
+ train.galore_rank: 16
+ train.galore_scale: 0.25
+ train.galore_target: all
+ train.galore_update_interval: 200
+ train.gradient_accumulation_steps: 8
+ train.learning_rate: 3e-5
+ train.logging_steps: 5
+ train.lora_alpha: 16
+ train.lora_dropout: 0
+ train.lora_rank: 8
+ train.lora_target: ''
+ train.loraplus_lr_ratio: 0
+ train.lr_scheduler_type: cosine
+ train.mask_history: false
+ train.max_grad_norm: '1.0'
+ train.max_samples: '10000'
+ train.neat_packing: false
+ train.neftune_alpha: 0
+ train.num_train_epochs: '1.0'
+ train.optim: adamw_torch
+ train.packing: false
+ train.ppo_score_norm: false
+ train.ppo_whiten_rewards: false
+ train.pref_beta: 0.1
+ train.pref_ftx: 0
+ train.pref_loss: sigmoid
+ train.report_to: false
+ train.resize_vocab: false
+ train.reward_model: null
+ train.save_steps: 100
+ train.shift_attn: false
+ train.train_on_prompt: false
+ train.training_stage: Supervised Fine-Tuning
+ train.use_badam: false
+ train.use_dora: false
+ train.use_galore: false
+ train.use_llama_pro: false
+ train.use_pissa: false
+ train.use_rslora: false
+ train.val_size: 0
+ train.warmup_steps: 0
running_log.txt ADDED
@@ -0,0 +1,284 @@
1
+ [INFO|parser.py:355] 2024-08-29 20:29:00,103 >> Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, compute dtype: torch.float16
2
+
3
+ [INFO|tokenization_utils_base.py:2289] 2024-08-29 20:29:00,279 >> loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.1/snapshots/2dcff66eac0c01dc50e4c41eea959968232187fe/tokenizer.model
4
+
5
+ [INFO|tokenization_utils_base.py:2289] 2024-08-29 20:29:00,280 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.1/snapshots/2dcff66eac0c01dc50e4c41eea959968232187fe/tokenizer.json
6
+
7
+ [INFO|tokenization_utils_base.py:2289] 2024-08-29 20:29:00,280 >> loading file added_tokens.json from cache at None
8
+
9
+ [INFO|tokenization_utils_base.py:2289] 2024-08-29 20:29:00,280 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.1/snapshots/2dcff66eac0c01dc50e4c41eea959968232187fe/special_tokens_map.json
10
+
11
+ [INFO|tokenization_utils_base.py:2289] 2024-08-29 20:29:00,280 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.1/snapshots/2dcff66eac0c01dc50e4c41eea959968232187fe/tokenizer_config.json
12
+
13
+ [INFO|template.py:373] 2024-08-29 20:29:00,325 >> Add pad token: </s>
14
+
15
+ [INFO|loader.py:52] 2024-08-29 20:29:00,326 >> Loading dataset HydraLM/physics_dataset_alpaca...
16
+
17
+ [INFO|configuration_utils.py:733] 2024-08-29 20:29:03,695 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.1/snapshots/2dcff66eac0c01dc50e4c41eea959968232187fe/config.json
18
+
19
+ [INFO|configuration_utils.py:800] 2024-08-29 20:29:03,696 >> Model config MistralConfig {
20
+ "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.1",
21
+ "architectures": [
22
+ "MistralForCausalLM"
23
+ ],
24
+ "attention_dropout": 0.0,
25
+ "bos_token_id": 1,
26
+ "eos_token_id": 2,
27
+ "head_dim": 128,
28
+ "hidden_act": "silu",
29
+ "hidden_size": 4096,
30
+ "initializer_range": 0.02,
31
+ "intermediate_size": 14336,
32
+ "max_position_embeddings": 32768,
33
+ "model_type": "mistral",
34
+ "num_attention_heads": 32,
35
+ "num_hidden_layers": 32,
36
+ "num_key_value_heads": 8,
37
+ "rms_norm_eps": 1e-05,
38
+ "rope_theta": 10000.0,
39
+ "sliding_window": 4096,
40
+ "tie_word_embeddings": false,
41
+ "torch_dtype": "bfloat16",
42
+ "transformers_version": "4.43.4",
43
+ "use_cache": true,
44
+ "vocab_size": 32000
45
+ }
46
+
47
+
48
+ [INFO|modeling_utils.py:3644] 2024-08-29 20:29:03,719 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.1/snapshots/2dcff66eac0c01dc50e4c41eea959968232187fe/model.safetensors.index.json
49
+
50
+ [INFO|modeling_utils.py:1572] 2024-08-29 20:29:03,720 >> Instantiating MistralForCausalLM model under default dtype torch.float16.
51
+
52
+ [INFO|configuration_utils.py:1038] 2024-08-29 20:29:03,721 >> Generate config GenerationConfig {
53
+ "bos_token_id": 1,
54
+ "eos_token_id": 2
55
+ }
56
+
57
+
58
+ [INFO|modeling_utils.py:4473] 2024-08-29 20:29:30,412 >> All model checkpoint weights were used when initializing MistralForCausalLM.
59
+
60
+
61
+ [INFO|modeling_utils.py:4481] 2024-08-29 20:29:30,412 >> All the weights of MistralForCausalLM were initialized from the model checkpoint at mistralai/Mistral-7B-Instruct-v0.1.
62
+ If your task is similar to the task the model of the checkpoint was trained on, you can already use MistralForCausalLM for predictions without further training.
63
+
64
+ [INFO|configuration_utils.py:993] 2024-08-29 20:29:30,511 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.1/snapshots/2dcff66eac0c01dc50e4c41eea959968232187fe/generation_config.json
65
+
66
+ [INFO|configuration_utils.py:1038] 2024-08-29 20:29:30,512 >> Generate config GenerationConfig {
67
+ "bos_token_id": 1,
68
+ "eos_token_id": 2
69
+ }
70
+
71
+
72
+ [INFO|checkpointing.py:103] 2024-08-29 20:29:30,518 >> Gradient checkpointing enabled.
73
+
74
+ [INFO|attention.py:84] 2024-08-29 20:29:30,519 >> Using torch SDPA for faster training and inference.
75
+
76
+ [INFO|adapter.py:302] 2024-08-29 20:29:30,519 >> Upcasting trainable params to float32.
77
+
78
+ [INFO|adapter.py:158] 2024-08-29 20:29:30,519 >> Fine-tuning method: LoRA
79
+
80
+ [INFO|misc.py:51] 2024-08-29 20:29:30,519 >> Found linear modules: q_proj,down_proj,up_proj,k_proj,o_proj,gate_proj,v_proj
81
+
82
+ [INFO|loader.py:196] 2024-08-29 20:29:46,150 >> trainable params: 20,971,520 || all params: 7,262,703,616 || trainable%: 0.2888
83
+
84
+ [INFO|trainer.py:648] 2024-08-29 20:29:46,160 >> Using auto half precision backend
85
+
86
+ [INFO|trainer.py:2134] 2024-08-29 20:29:46,579 >> ***** Running training *****
87
+
88
+ [INFO|trainer.py:2135] 2024-08-29 20:29:46,579 >> Num examples = 10,000
89
+
90
+ [INFO|trainer.py:2136] 2024-08-29 20:29:46,579 >> Num Epochs = 1
91
+
92
+ [INFO|trainer.py:2137] 2024-08-29 20:29:46,579 >> Instantaneous batch size per device = 8
93
+
94
+ [INFO|trainer.py:2140] 2024-08-29 20:29:46,579 >> Total train batch size (w. parallel, distributed & accumulation) = 64
95
+
96
+ [INFO|trainer.py:2141] 2024-08-29 20:29:46,579 >> Gradient Accumulation steps = 8
97
+
98
+ [INFO|trainer.py:2142] 2024-08-29 20:29:46,579 >> Total optimization steps = 156
99
+
100
+ [INFO|trainer.py:2143] 2024-08-29 20:29:46,584 >> Number of trainable parameters = 20,971,520
101
+
102
+ [INFO|callbacks.py:319] 2024-08-29 20:30:43,585 >> {'loss': 0.6780, 'learning_rate': 2.9924e-05, 'epoch': 0.03, 'throughput': 2874.51}
103
+
104
+ [INFO|callbacks.py:319] 2024-08-29 20:31:40,281 >> {'loss': 0.6551, 'learning_rate': 2.9697e-05, 'epoch': 0.06, 'throughput': 2882.14}
105
+
106
+ [INFO|callbacks.py:319] 2024-08-29 20:32:36,903 >> {'loss': 0.6372, 'learning_rate': 2.9321e-05, 'epoch': 0.10, 'throughput': 2885.94}
107
+
108
+ [INFO|callbacks.py:319] 2024-08-29 20:33:33,631 >> {'loss': 0.6201, 'learning_rate': 2.8800e-05, 'epoch': 0.13, 'throughput': 2886.5}
109
+
110
+ [INFO|callbacks.py:319] 2024-08-29 20:34:30,081 >> {'loss': 0.5965, 'learning_rate': 2.8139e-05, 'epoch': 0.16, 'throughput': 2889.67}
111
+
112
+ [INFO|callbacks.py:319] 2024-08-29 20:35:26,826 >> {'loss': 0.5959, 'learning_rate': 2.7345e-05, 'epoch': 0.19, 'throughput': 2889.27}
113
+
114
+ [INFO|callbacks.py:319] 2024-08-29 20:36:23,284 >> {'loss': 0.6006, 'learning_rate': 2.6426e-05, 'epoch': 0.22, 'throughput': 2891.08}
115
+
116
+ [INFO|callbacks.py:319] 2024-08-29 20:37:19,827 >> {'loss': 0.5940, 'learning_rate': 2.5391e-05, 'epoch': 0.26, 'throughput': 2891.9}
117
+
118
+ [INFO|callbacks.py:319] 2024-08-29 20:38:16,461 >> {'loss': 0.5825, 'learning_rate': 2.4251e-05, 'epoch': 0.29, 'throughput': 2892.01}
119
+
120
+ [INFO|callbacks.py:319] 2024-08-29 20:39:13,245 >> {'loss': 0.5871, 'learning_rate': 2.3017e-05, 'epoch': 0.32, 'throughput': 2891.34}
121
+
122
+ [INFO|callbacks.py:319] 2024-08-29 20:40:10,089 >> {'loss': 0.5771, 'learning_rate': 2.1702e-05, 'epoch': 0.35, 'throughput': 2890.51}
123
+
124
+ [INFO|callbacks.py:319] 2024-08-29 20:41:06,524 >> {'loss': 0.5544, 'learning_rate': 2.0319e-05, 'epoch': 0.38, 'throughput': 2891.57}
125
+
126
+ [INFO|callbacks.py:319] 2024-08-29 20:42:03,054 >> {'loss': 0.5768, 'learning_rate': 1.8882e-05, 'epoch': 0.42, 'throughput': 2892.08}
127
+
128
+ [INFO|callbacks.py:319] 2024-08-29 20:42:59,850 >> {'loss': 0.5534, 'learning_rate': 1.7406e-05, 'epoch': 0.45, 'throughput': 2891.55}
129
+
130
+ [INFO|callbacks.py:319] 2024-08-29 20:43:56,581 >> {'loss': 0.5597, 'learning_rate': 1.5906e-05, 'epoch': 0.48, 'throughput': 2891.32}
131
+
132
+ [INFO|callbacks.py:319] 2024-08-29 20:44:53,211 >> {'loss': 0.5710, 'learning_rate': 1.4396e-05, 'epoch': 0.51, 'throughput': 2891.43}
133
+
134
+ [INFO|callbacks.py:319] 2024-08-29 20:45:49,959 >> {'loss': 0.5572, 'learning_rate': 1.2892e-05, 'epoch': 0.54, 'throughput': 2891.18}
135
+
136
+ [INFO|callbacks.py:319] 2024-08-29 20:46:46,195 >> {'loss': 0.5686, 'learning_rate': 1.1410e-05, 'epoch': 0.58, 'throughput': 2892.41}
137
+
138
+ [INFO|callbacks.py:319] 2024-08-29 20:47:42,985 >> {'loss': 0.5525, 'learning_rate': 9.9644e-06, 'epoch': 0.61, 'throughput': 2892.02}
139
+
140
+ [INFO|callbacks.py:319] 2024-08-29 20:48:39,767 >> {'loss': 0.5594, 'learning_rate': 8.5696e-06, 'epoch': 0.64, 'throughput': 2891.69}
141
+
142
+ [INFO|trainer.py:3503] 2024-08-29 20:48:39,768 >> Saving model checkpoint to saves/Mistral-7B-v0.1-Chat/lora/mistral_physs/checkpoint-100
143
+
144
+ [INFO|configuration_utils.py:733] 2024-08-29 20:48:40,107 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.1/snapshots/2dcff66eac0c01dc50e4c41eea959968232187fe/config.json
145
+
146
+ [INFO|configuration_utils.py:800] 2024-08-29 20:48:40,107 >> Model config MistralConfig {
147
+ "architectures": [
148
+ "MistralForCausalLM"
149
+ ],
150
+ "attention_dropout": 0.0,
151
+ "bos_token_id": 1,
152
+ "eos_token_id": 2,
153
+ "head_dim": 128,
154
+ "hidden_act": "silu",
155
+ "hidden_size": 4096,
156
+ "initializer_range": 0.02,
157
+ "intermediate_size": 14336,
158
+ "max_position_embeddings": 32768,
159
+ "model_type": "mistral",
160
+ "num_attention_heads": 32,
161
+ "num_hidden_layers": 32,
162
+ "num_key_value_heads": 8,
163
+ "rms_norm_eps": 1e-05,
164
+ "rope_theta": 10000.0,
165
+ "sliding_window": 4096,
166
+ "tie_word_embeddings": false,
167
+ "torch_dtype": "bfloat16",
168
+ "transformers_version": "4.43.4",
169
+ "use_cache": true,
170
+ "vocab_size": 32000
171
+ }
172
+
173
+
174
+ [INFO|tokenization_utils_base.py:2702] 2024-08-29 20:48:40,206 >> tokenizer config file saved in saves/Mistral-7B-v0.1-Chat/lora/mistral_physs/checkpoint-100/tokenizer_config.json
175
+
176
+ [INFO|tokenization_utils_base.py:2711] 2024-08-29 20:48:40,207 >> Special tokens file saved in saves/Mistral-7B-v0.1-Chat/lora/mistral_physs/checkpoint-100/special_tokens_map.json
177
+
178
+ [INFO|callbacks.py:319] 2024-08-29 20:49:36,953 >> {'loss': 0.5709, 'learning_rate': 7.2399e-06, 'epoch': 0.67, 'throughput': 2890.41}
179
+
180
+ [INFO|callbacks.py:319] 2024-08-29 20:50:33,412 >> {'loss': 0.5555, 'learning_rate': 5.9889e-06, 'epoch': 0.70, 'throughput': 2890.93}
181
+
182
+ [INFO|callbacks.py:319] 2024-08-29 20:51:29,876 >> {'loss': 0.5597, 'learning_rate': 4.8291e-06, 'epoch': 0.74, 'throughput': 2891.4}
183
+
184
+ [INFO|callbacks.py:319] 2024-08-29 20:52:26,764 >> {'loss': 0.5432, 'learning_rate': 3.7723e-06, 'epoch': 0.77, 'throughput': 2890.92}
185
+
186
+ [INFO|callbacks.py:319] 2024-08-29 20:53:23,270 >> {'loss': 0.5363, 'learning_rate': 2.8293e-06, 'epoch': 0.80, 'throughput': 2891.26}
187
+
188
+ [INFO|callbacks.py:319] 2024-08-29 20:54:19,827 >> {'loss': 0.5582, 'learning_rate': 2.0096e-06, 'epoch': 0.83, 'throughput': 2891.48}
189
+
190
+ [INFO|callbacks.py:319] 2024-08-29 20:55:16,550 >> {'loss': 0.5539, 'learning_rate': 1.3215e-06, 'epoch': 0.86, 'throughput': 2891.37}
191
+
192
+ [INFO|callbacks.py:319] 2024-08-29 20:56:13,084 >> {'loss': 0.5529, 'learning_rate': 7.7195e-07, 'epoch': 0.90, 'throughput': 2891.6}
193
+
194
+ [INFO|callbacks.py:319] 2024-08-29 20:57:09,602 >> {'loss': 0.5584, 'learning_rate': 3.6654e-07, 'epoch': 0.93, 'throughput': 2891.86}
195
+
196
+ [INFO|callbacks.py:319] 2024-08-29 20:58:06,181 >> {'loss': 0.5503, 'learning_rate': 1.0937e-07, 'epoch': 0.96, 'throughput': 2891.99}
197
+
198
+ [INFO|callbacks.py:319] 2024-08-29 20:59:02,816 >> {'loss': 0.5549, 'learning_rate': 3.0416e-09, 'epoch': 0.99, 'throughput': 2892.02}
199
+
200
+ [INFO|trainer.py:3503] 2024-08-29 20:59:14,245 >> Saving model checkpoint to saves/Mistral-7B-v0.1-Chat/lora/mistral_physs/checkpoint-156
201
+
202
+ [INFO|configuration_utils.py:733] 2024-08-29 20:59:14,599 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.1/snapshots/2dcff66eac0c01dc50e4c41eea959968232187fe/config.json
203
+
204
+ [INFO|configuration_utils.py:800] 2024-08-29 20:59:14,600 >> Model config MistralConfig {
205
+ "architectures": [
206
+ "MistralForCausalLM"
207
+ ],
208
+ "attention_dropout": 0.0,
209
+ "bos_token_id": 1,
210
+ "eos_token_id": 2,
211
+ "head_dim": 128,
212
+ "hidden_act": "silu",
213
+ "hidden_size": 4096,
214
+ "initializer_range": 0.02,
215
+ "intermediate_size": 14336,
216
+ "max_position_embeddings": 32768,
217
+ "model_type": "mistral",
218
+ "num_attention_heads": 32,
219
+ "num_hidden_layers": 32,
220
+ "num_key_value_heads": 8,
221
+ "rms_norm_eps": 1e-05,
222
+ "rope_theta": 10000.0,
223
+ "sliding_window": 4096,
224
+ "tie_word_embeddings": false,
225
+ "torch_dtype": "bfloat16",
226
+ "transformers_version": "4.43.4",
227
+ "use_cache": true,
228
+ "vocab_size": 32000
229
+ }
230
+
231
+
232
+ [INFO|tokenization_utils_base.py:2702] 2024-08-29 20:59:14,671 >> tokenizer config file saved in saves/Mistral-7B-v0.1-Chat/lora/mistral_physs/checkpoint-156/tokenizer_config.json
233
+
234
+ [INFO|tokenization_utils_base.py:2711] 2024-08-29 20:59:14,671 >> Special tokens file saved in saves/Mistral-7B-v0.1-Chat/lora/mistral_physs/checkpoint-156/special_tokens_map.json
235
+
236
+ [INFO|trainer.py:2394] 2024-08-29 20:59:14,879 >>
237
+
238
+ Training completed. Do not forget to share your model on huggingface.co/models =)
239
+
240
+
241
+
242
+ [INFO|trainer.py:3503] 2024-08-29 20:59:14,882 >> Saving model checkpoint to saves/Mistral-7B-v0.1-Chat/lora/mistral_physs
243
+
244
+ [INFO|configuration_utils.py:733] 2024-08-29 20:59:15,109 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.1/snapshots/2dcff66eac0c01dc50e4c41eea959968232187fe/config.json
245
+
246
+ [INFO|configuration_utils.py:800] 2024-08-29 20:59:15,110 >> Model config MistralConfig {
247
+ "architectures": [
248
+ "MistralForCausalLM"
249
+ ],
250
+ "attention_dropout": 0.0,
251
+ "bos_token_id": 1,
252
+ "eos_token_id": 2,
253
+ "head_dim": 128,
254
+ "hidden_act": "silu",
255
+ "hidden_size": 4096,
256
+ "initializer_range": 0.02,
257
+ "intermediate_size": 14336,
258
+ "max_position_embeddings": 32768,
259
+ "model_type": "mistral",
260
+ "num_attention_heads": 32,
261
+ "num_hidden_layers": 32,
262
+ "num_key_value_heads": 8,
263
+ "rms_norm_eps": 1e-05,
264
+ "rope_theta": 10000.0,
265
+ "sliding_window": 4096,
266
+ "tie_word_embeddings": false,
267
+ "torch_dtype": "bfloat16",
268
+ "transformers_version": "4.43.4",
269
+ "use_cache": true,
270
+ "vocab_size": 32000
271
+ }
272
+
273
+
274
+ [INFO|tokenization_utils_base.py:2702] 2024-08-29 20:59:15,181 >> tokenizer config file saved in saves/Mistral-7B-v0.1-Chat/lora/mistral_physs/tokenizer_config.json
275
+
276
+ [INFO|tokenization_utils_base.py:2711] 2024-08-29 20:59:15,182 >> Special tokens file saved in saves/Mistral-7B-v0.1-Chat/lora/mistral_physs/special_tokens_map.json
277
+
278
+ [WARNING|ploting.py:89] 2024-08-29 20:59:15,318 >> No metric eval_loss to plot.
279
+
280
+ [WARNING|ploting.py:89] 2024-08-29 20:59:15,318 >> No metric eval_accuracy to plot.
281
+
282
+ [INFO|modelcard.py:449] 2024-08-29 20:59:15,320 >> Dropping the following result as it does not have all the necessary fields:
283
+ {'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
284
+
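
One of the log lines above reports `trainable params: 20,971,520 || all params: 7,262,703,616 || trainable%: 0.2888`; that count follows directly from the model shapes in the config dump and the LoRA rank, as this small check shows:

```python
# Reproduce the trainable-parameter count from the log above.
# Mistral-7B shapes (from the config dump): hidden 4096, intermediate 14336,
# 8 KV heads x head_dim 128 = 1024-wide k/v projections, 32 layers; LoRA rank r = 8.
r, layers = 8, 32
hidden, inter, kv = 4096, 14336, 1024

def lora_params(d_in, d_out):
    return r * d_in + r * d_out  # A: (r x d_in), B: (d_out x r)

per_layer = (
    lora_params(hidden, hidden)    # q_proj
    + lora_params(hidden, kv)      # k_proj
    + lora_params(hidden, kv)      # v_proj
    + lora_params(hidden, hidden)  # o_proj
    + lora_params(hidden, inter)   # gate_proj
    + lora_params(hidden, inter)   # up_proj
    + lora_params(inter, hidden)   # down_proj
)
print(layers * per_layer)  # 20971520, matching the log
```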
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+ size 493443
tokenizer_config.json ADDED
@@ -0,0 +1,46 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "additional_special_tokens": [],
+ "bos_token": "<s>",
+ "chat_template": "{{ '<s>' }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "split_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
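
The `chat_template` above renders conversations in the Mistral `[INST] ... [/INST]` format; a small sketch of applying it (assuming the tokenizer is loaded from this repository, shown here with the hypothetical id `SriSanth2345/mistral_physs`):

```python
# Sketch: rendering a conversation with the chat_template defined above.
from transformers import AutoTokenizer

# Assumption: load the tokenizer files shipped in this repo (hypothetical id shown).
tokenizer = AutoTokenizer.from_pretrained("SriSanth2345/mistral_physs")

messages = [{"role": "user", "content": "Explain conservation of momentum."}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
print(prompt)  # "<s>[INST] Explain conservation of momentum. [/INST]"
```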
train_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "epoch": 0.9984,
+ "num_input_tokens_seen": 5111808,
+ "total_flos": 2.1873318928633037e+17,
+ "train_loss": 0.5760276004289969,
+ "train_runtime": 1768.2955,
+ "train_samples_per_second": 5.655,
+ "train_steps_per_second": 0.088
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,32 @@
1
+ {"current_steps": 5, "total_steps": 156, "loss": 0.678, "learning_rate": 2.9924022525939684e-05, "epoch": 0.032, "percentage": 3.21, "elapsed_time": "0:00:56", "remaining_time": "0:28:41", "throughput": 2874.51, "total_tokens": 163840}
2
+ {"current_steps": 10, "total_steps": 156, "loss": 0.6551, "learning_rate": 2.9696859780634016e-05, "epoch": 0.064, "percentage": 6.41, "elapsed_time": "0:01:53", "remaining_time": "0:27:39", "throughput": 2882.14, "total_tokens": 327680}
3
+ {"current_steps": 15, "total_steps": 156, "loss": 0.6372, "learning_rate": 2.9320812997628184e-05, "epoch": 0.096, "percentage": 9.62, "elapsed_time": "0:02:50", "remaining_time": "0:26:40", "throughput": 2885.94, "total_tokens": 491520}
4
+ {"current_steps": 20, "total_steps": 156, "loss": 0.6201, "learning_rate": 2.8799691654882365e-05, "epoch": 0.128, "percentage": 12.82, "elapsed_time": "0:03:47", "remaining_time": "0:25:43", "throughput": 2886.5, "total_tokens": 655360}
5
+ {"current_steps": 25, "total_steps": 156, "loss": 0.5965, "learning_rate": 2.8138774883503317e-05, "epoch": 0.16, "percentage": 16.03, "elapsed_time": "0:04:43", "remaining_time": "0:24:45", "throughput": 2889.67, "total_tokens": 819200}
6
+ {"current_steps": 30, "total_steps": 156, "loss": 0.5959, "learning_rate": 2.7344757988404845e-05, "epoch": 0.192, "percentage": 19.23, "elapsed_time": "0:05:40", "remaining_time": "0:23:48", "throughput": 2889.27, "total_tokens": 983040}
7
+ {"current_steps": 35, "total_steps": 156, "loss": 0.6006, "learning_rate": 2.6425684622660387e-05, "epoch": 0.224, "percentage": 22.44, "elapsed_time": "0:06:36", "remaining_time": "0:22:51", "throughput": 2891.08, "total_tokens": 1146880}
8
+ {"current_steps": 40, "total_steps": 156, "loss": 0.594, "learning_rate": 2.5390865302643993e-05, "epoch": 0.256, "percentage": 25.64, "elapsed_time": "0:07:33", "remaining_time": "0:21:54", "throughput": 2891.9, "total_tokens": 1310720}
9
+ {"current_steps": 45, "total_steps": 156, "loss": 0.5825, "learning_rate": 2.425078308942815e-05, "epoch": 0.288, "percentage": 28.85, "elapsed_time": "0:08:29", "remaining_time": "0:20:57", "throughput": 2892.01, "total_tokens": 1474560}
10
+ {"current_steps": 50, "total_steps": 156, "loss": 0.5871, "learning_rate": 2.3016987391917016e-05, "epoch": 0.32, "percentage": 32.05, "elapsed_time": "0:09:26", "remaining_time": "0:20:01", "throughput": 2891.34, "total_tokens": 1638400}
11
+ {"current_steps": 55, "total_steps": 156, "loss": 0.5771, "learning_rate": 2.1701976967524388e-05, "epoch": 0.352, "percentage": 35.26, "elapsed_time": "0:10:23", "remaining_time": "0:19:04", "throughput": 2890.51, "total_tokens": 1802240}
12
+ {"current_steps": 60, "total_steps": 156, "loss": 0.5544, "learning_rate": 2.0319073305638035e-05, "epoch": 0.384, "percentage": 38.46, "elapsed_time": "0:11:19", "remaining_time": "0:18:07", "throughput": 2891.57, "total_tokens": 1966080}
13
+ {"current_steps": 65, "total_steps": 156, "loss": 0.5768, "learning_rate": 1.888228567653781e-05, "epoch": 0.416, "percentage": 41.67, "elapsed_time": "0:12:16", "remaining_time": "0:17:11", "throughput": 2892.08, "total_tokens": 2129920}
14
+ {"current_steps": 70, "total_steps": 156, "loss": 0.5534, "learning_rate": 1.7406169212866405e-05, "epoch": 0.448, "percentage": 44.87, "elapsed_time": "0:13:13", "remaining_time": "0:16:14", "throughput": 2891.55, "total_tokens": 2293760}
15
+ {"current_steps": 75, "total_steps": 156, "loss": 0.5597, "learning_rate": 1.5905677461334292e-05, "epoch": 0.48, "percentage": 48.08, "elapsed_time": "0:14:09", "remaining_time": "0:15:17", "throughput": 2891.32, "total_tokens": 2457600}
16
+ {"current_steps": 80, "total_steps": 156, "loss": 0.571, "learning_rate": 1.4396010898358778e-05, "epoch": 0.512, "percentage": 51.28, "elapsed_time": "0:15:06", "remaining_time": "0:14:21", "throughput": 2891.43, "total_tokens": 2621440}
17
+ {"current_steps": 85, "total_steps": 156, "loss": 0.5572, "learning_rate": 1.2892462944223613e-05, "epoch": 0.544, "percentage": 54.49, "elapsed_time": "0:16:03", "remaining_time": "0:13:24", "throughput": 2891.18, "total_tokens": 2785280}
18
+ {"current_steps": 90, "total_steps": 156, "loss": 0.5686, "learning_rate": 1.1410265035686639e-05, "epoch": 0.576, "percentage": 57.69, "elapsed_time": "0:16:59", "remaining_time": "0:12:27", "throughput": 2892.41, "total_tokens": 2949120}
19
+ {"current_steps": 95, "total_steps": 156, "loss": 0.5525, "learning_rate": 9.964432326500933e-06, "epoch": 0.608, "percentage": 60.9, "elapsed_time": "0:17:56", "remaining_time": "0:11:31", "throughput": 2892.02, "total_tokens": 3112960}
20
+ {"current_steps": 100, "total_steps": 156, "loss": 0.5594, "learning_rate": 8.569611578954186e-06, "epoch": 0.64, "percentage": 64.1, "elapsed_time": "0:18:53", "remaining_time": "0:10:34", "throughput": 2891.69, "total_tokens": 3276800}
21
+ {"current_steps": 105, "total_steps": 156, "loss": 0.5709, "learning_rate": 7.239932787335147e-06, "epoch": 0.672, "percentage": 67.31, "elapsed_time": "0:19:50", "remaining_time": "0:09:38", "throughput": 2890.41, "total_tokens": 3440640}
22
+ {"current_steps": 110, "total_steps": 156, "loss": 0.5555, "learning_rate": 5.988866036430314e-06, "epoch": 0.704, "percentage": 70.51, "elapsed_time": "0:20:46", "remaining_time": "0:08:41", "throughput": 2890.93, "total_tokens": 3604480}
23
+ {"current_steps": 115, "total_steps": 156, "loss": 0.5597, "learning_rate": 4.829085045121636e-06, "epoch": 0.736, "percentage": 73.72, "elapsed_time": "0:21:43", "remaining_time": "0:07:44", "throughput": 2891.4, "total_tokens": 3768320}
24
+ {"current_steps": 120, "total_steps": 156, "loss": 0.5432, "learning_rate": 3.772338777433482e-06, "epoch": 0.768, "percentage": 76.92, "elapsed_time": "0:22:40", "remaining_time": "0:06:48", "throughput": 2890.92, "total_tokens": 3932160}
25
+ {"current_steps": 125, "total_steps": 156, "loss": 0.5363, "learning_rate": 2.829332421651404e-06, "epoch": 0.8, "percentage": 80.13, "elapsed_time": "0:23:36", "remaining_time": "0:05:51", "throughput": 2891.26, "total_tokens": 4096000}
26
+ {"current_steps": 130, "total_steps": 156, "loss": 0.5582, "learning_rate": 2.0096189432334194e-06, "epoch": 0.832, "percentage": 83.33, "elapsed_time": "0:24:33", "remaining_time": "0:04:54", "throughput": 2891.48, "total_tokens": 4259840}
27
+ {"current_steps": 135, "total_steps": 156, "loss": 0.5539, "learning_rate": 1.321502310118649e-06, "epoch": 0.864, "percentage": 86.54, "elapsed_time": "0:25:29", "remaining_time": "0:03:57", "throughput": 2891.37, "total_tokens": 4423680}
28
+ {"current_steps": 140, "total_steps": 156, "loss": 0.5529, "learning_rate": 7.719533707928178e-07, "epoch": 0.896, "percentage": 89.74, "elapsed_time": "0:26:26", "remaining_time": "0:03:01", "throughput": 2891.6, "total_tokens": 4587520}
29
+ {"current_steps": 145, "total_steps": 156, "loss": 0.5584, "learning_rate": 3.665392372935922e-07, "epoch": 0.928, "percentage": 92.95, "elapsed_time": "0:27:23", "remaining_time": "0:02:04", "throughput": 2891.86, "total_tokens": 4751360}
30
+ {"current_steps": 150, "total_steps": 156, "loss": 0.5503, "learning_rate": 1.0936688852919042e-07, "epoch": 0.96, "percentage": 96.15, "elapsed_time": "0:28:19", "remaining_time": "0:01:07", "throughput": 2891.99, "total_tokens": 4915200}
31
+ {"current_steps": 155, "total_steps": 156, "loss": 0.5549, "learning_rate": 3.0415652272480776e-09, "epoch": 0.992, "percentage": 99.36, "elapsed_time": "0:29:16", "remaining_time": "0:00:11", "throughput": 2892.02, "total_tokens": 5079040}
32
+ {"current_steps": 156, "total_steps": 156, "epoch": 0.9984, "percentage": 100.0, "elapsed_time": "0:29:28", "remaining_time": "0:00:00", "throughput": 2890.82, "total_tokens": 5111808}
trainer_state.json ADDED
@@ -0,0 +1,291 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9984,
5
+ "eval_steps": 500,
6
+ "global_step": 156,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.032,
13
+ "grad_norm": 0.3039781153202057,
14
+ "learning_rate": 2.9924022525939684e-05,
15
+ "loss": 0.678,
16
+ "num_input_tokens_seen": 163840,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.064,
21
+ "grad_norm": 0.2683364450931549,
22
+ "learning_rate": 2.9696859780634016e-05,
23
+ "loss": 0.6551,
24
+ "num_input_tokens_seen": 327680,
25
+ "step": 10
26
+ },
27
+ {
28
+ "epoch": 0.096,
29
+ "grad_norm": 0.246008038520813,
30
+ "learning_rate": 2.9320812997628184e-05,
31
+ "loss": 0.6372,
32
+ "num_input_tokens_seen": 491520,
33
+ "step": 15
34
+ },
35
+ {
36
+ "epoch": 0.128,
37
+ "grad_norm": 0.23320266604423523,
38
+ "learning_rate": 2.8799691654882365e-05,
39
+ "loss": 0.6201,
40
+ "num_input_tokens_seen": 655360,
41
+ "step": 20
42
+ },
43
+ {
44
+ "epoch": 0.16,
45
+ "grad_norm": 0.24441155791282654,
46
+ "learning_rate": 2.8138774883503317e-05,
47
+ "loss": 0.5965,
48
+ "num_input_tokens_seen": 819200,
49
+ "step": 25
50
+ },
51
+ {
52
+ "epoch": 0.192,
53
+ "grad_norm": 0.2341088205575943,
54
+ "learning_rate": 2.7344757988404845e-05,
55
+ "loss": 0.5959,
56
+ "num_input_tokens_seen": 983040,
57
+ "step": 30
58
+ },
59
+ {
60
+ "epoch": 0.224,
61
+ "grad_norm": 0.23481066524982452,
62
+ "learning_rate": 2.6425684622660387e-05,
63
+ "loss": 0.6006,
64
+ "num_input_tokens_seen": 1146880,
65
+ "step": 35
66
+ },
67
+ {
68
+ "epoch": 0.256,
69
+ "grad_norm": 0.2456846982240677,
70
+ "learning_rate": 2.5390865302643993e-05,
71
+ "loss": 0.594,
72
+ "num_input_tokens_seen": 1310720,
73
+ "step": 40
74
+ },
75
+ {
76
+ "epoch": 0.288,
77
+ "grad_norm": 0.25728079676628113,
78
+ "learning_rate": 2.425078308942815e-05,
79
+ "loss": 0.5825,
80
+ "num_input_tokens_seen": 1474560,
81
+ "step": 45
82
+ },
83
+ {
84
+ "epoch": 0.32,
85
+ "grad_norm": 0.2505452334880829,
86
+ "learning_rate": 2.3016987391917016e-05,
87
+ "loss": 0.5871,
88
+ "num_input_tokens_seen": 1638400,
89
+ "step": 50
90
+ },
91
+ {
92
+ "epoch": 0.352,
93
+ "grad_norm": 0.27519550919532776,
94
+ "learning_rate": 2.1701976967524388e-05,
95
+ "loss": 0.5771,
96
+ "num_input_tokens_seen": 1802240,
97
+ "step": 55
98
+ },
99
+ {
100
+ "epoch": 0.384,
101
+ "grad_norm": 0.2705497741699219,
102
+ "learning_rate": 2.0319073305638035e-05,
103
+ "loss": 0.5544,
104
+ "num_input_tokens_seen": 1966080,
105
+ "step": 60
106
+ },
107
+ {
108
+ "epoch": 0.416,
109
+ "grad_norm": 0.2861919701099396,
110
+ "learning_rate": 1.888228567653781e-05,
111
+ "loss": 0.5768,
112
+ "num_input_tokens_seen": 2129920,
113
+ "step": 65
114
+ },
115
+ {
116
+ "epoch": 0.448,
117
+ "grad_norm": 0.29395216703414917,
118
+ "learning_rate": 1.7406169212866405e-05,
119
+ "loss": 0.5534,
120
+ "num_input_tokens_seen": 2293760,
121
+ "step": 70
122
+ },
123
+ {
124
+ "epoch": 0.48,
125
+ "grad_norm": 0.285727322101593,
126
+ "learning_rate": 1.5905677461334292e-05,
127
+ "loss": 0.5597,
128
+ "num_input_tokens_seen": 2457600,
129
+ "step": 75
130
+ },
131
+ {
132
+ "epoch": 0.512,
133
+ "grad_norm": 0.30645114183425903,
134
+ "learning_rate": 1.4396010898358778e-05,
135
+ "loss": 0.571,
136
+ "num_input_tokens_seen": 2621440,
137
+ "step": 80
138
+ },
139
+ {
140
+ "epoch": 0.544,
141
+ "grad_norm": 0.2912521958351135,
142
+ "learning_rate": 1.2892462944223613e-05,
143
+ "loss": 0.5572,
144
+ "num_input_tokens_seen": 2785280,
145
+ "step": 85
146
+ },
147
+ {
148
+ "epoch": 0.576,
149
+ "grad_norm": 0.3027022182941437,
150
+ "learning_rate": 1.1410265035686639e-05,
151
+ "loss": 0.5686,
152
+ "num_input_tokens_seen": 2949120,
153
+ "step": 90
154
+ },
155
+ {
156
+ "epoch": 0.608,
157
+ "grad_norm": 0.32110294699668884,
158
+ "learning_rate": 9.964432326500933e-06,
159
+ "loss": 0.5525,
160
+ "num_input_tokens_seen": 3112960,
161
+ "step": 95
162
+ },
163
+ {
164
+ "epoch": 0.64,
165
+ "grad_norm": 0.31834056973457336,
166
+ "learning_rate": 8.569611578954186e-06,
167
+ "loss": 0.5594,
168
+ "num_input_tokens_seen": 3276800,
169
+ "step": 100
170
+ },
171
+ {
172
+ "epoch": 0.672,
173
+ "grad_norm": 0.3223641812801361,
174
+ "learning_rate": 7.239932787335147e-06,
175
+ "loss": 0.5709,
176
+ "num_input_tokens_seen": 3440640,
177
+ "step": 105
178
+ },
179
+ {
180
+ "epoch": 0.704,
181
+ "grad_norm": 0.32236120104789734,
182
+ "learning_rate": 5.988866036430314e-06,
183
+ "loss": 0.5555,
184
+ "num_input_tokens_seen": 3604480,
185
+ "step": 110
186
+ },
187
+ {
188
+ "epoch": 0.736,
189
+ "grad_norm": 0.32125550508499146,
190
+ "learning_rate": 4.829085045121636e-06,
191
+ "loss": 0.5597,
192
+ "num_input_tokens_seen": 3768320,
193
+ "step": 115
194
+ },
195
+ {
196
+ "epoch": 0.768,
197
+ "grad_norm": 0.325810968875885,
198
+ "learning_rate": 3.772338777433482e-06,
199
+ "loss": 0.5432,
200
+ "num_input_tokens_seen": 3932160,
201
+ "step": 120
202
+ },
203
+ {
204
+ "epoch": 0.8,
205
+ "grad_norm": 0.32014375925064087,
206
+ "learning_rate": 2.829332421651404e-06,
207
+ "loss": 0.5363,
208
+ "num_input_tokens_seen": 4096000,
209
+ "step": 125
210
+ },
211
+ {
212
+ "epoch": 0.832,
213
+ "grad_norm": 0.32001549005508423,
214
+ "learning_rate": 2.0096189432334194e-06,
215
+ "loss": 0.5582,
216
+ "num_input_tokens_seen": 4259840,
217
+ "step": 130
218
+ },
219
+ {
220
+ "epoch": 0.864,
221
+ "grad_norm": 0.33245849609375,
222
+ "learning_rate": 1.321502310118649e-06,
223
+ "loss": 0.5539,
224
+ "num_input_tokens_seen": 4423680,
225
+ "step": 135
226
+ },
227
+ {
228
+ "epoch": 0.896,
229
+ "grad_norm": 0.3493448495864868,
230
+ "learning_rate": 7.719533707928178e-07,
231
+ "loss": 0.5529,
232
+ "num_input_tokens_seen": 4587520,
233
+ "step": 140
234
+ },
235
+ {
236
+ "epoch": 0.928,
237
+ "grad_norm": 0.3279431462287903,
238
+ "learning_rate": 3.665392372935922e-07,
239
+ "loss": 0.5584,
240
+ "num_input_tokens_seen": 4751360,
241
+ "step": 145
242
+ },
243
+ {
244
+ "epoch": 0.96,
245
+ "grad_norm": 0.3298741281032562,
246
+ "learning_rate": 1.0936688852919042e-07,
247
+ "loss": 0.5503,
248
+ "num_input_tokens_seen": 4915200,
249
+ "step": 150
250
+ },
251
+ {
252
+ "epoch": 0.992,
253
+ "grad_norm": 0.33253204822540283,
254
+ "learning_rate": 3.0415652272480776e-09,
255
+ "loss": 0.5549,
256
+ "num_input_tokens_seen": 5079040,
257
+ "step": 155
258
+ },
259
+ {
260
+ "epoch": 0.9984,
261
+ "num_input_tokens_seen": 5111808,
262
+ "step": 156,
263
+ "total_flos": 2.1873318928633037e+17,
264
+ "train_loss": 0.5760276004289969,
265
+ "train_runtime": 1768.2955,
266
+ "train_samples_per_second": 5.655,
267
+ "train_steps_per_second": 0.088
268
+ }
269
+ ],
270
+ "logging_steps": 5,
271
+ "max_steps": 156,
272
+ "num_input_tokens_seen": 5111808,
273
+ "num_train_epochs": 1,
274
+ "save_steps": 100,
275
+ "stateful_callbacks": {
276
+ "TrainerControl": {
277
+ "args": {
278
+ "should_epoch_stop": false,
279
+ "should_evaluate": false,
280
+ "should_log": false,
281
+ "should_save": true,
282
+ "should_training_stop": true
283
+ },
284
+ "attributes": {}
285
+ }
286
+ },
287
+ "total_flos": 2.1873318928633037e+17,
288
+ "train_batch_size": 8,
289
+ "trial_name": null,
290
+ "trial_params": null
291
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4d873c6ea94c48195922f46796afd4fcd5262029afc0aa21fb929e23ac19d481
+ size 5368
training_args.yaml ADDED
@@ -0,0 +1,32 @@
+ cutoff_len: 512
+ dataset: physics
+ dataset_dir: data
+ ddp_timeout: 180000000
+ do_train: true
+ finetuning_type: lora
+ flash_attn: auto
+ fp16: true
+ gradient_accumulation_steps: 8
+ include_num_input_tokens_seen: true
+ learning_rate: 3.0e-05
+ logging_steps: 5
+ lora_alpha: 16
+ lora_dropout: 0
+ lora_rank: 8
+ lora_target: all
+ lr_scheduler_type: cosine
+ max_grad_norm: 1.0
+ max_samples: 10000
+ model_name_or_path: mistralai/Mistral-7B-Instruct-v0.1
+ num_train_epochs: 1.0
+ optim: adamw_torch
+ output_dir: saves/Mistral-7B-v0.1-Chat/lora/mistral_physs
+ packing: false
+ per_device_train_batch_size: 8
+ plot_loss: true
+ preprocessing_num_workers: 16
+ report_to: none
+ save_steps: 100
+ stage: sft
+ template: mistral
+ warmup_steps: 0
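
This YAML is the argument file LLaMA-Factory consumed for the run; the schedule reported in running_log.txt and trainer_state.json follows directly from it (a sketch with values copied from above):

```python
# Derive the training schedule implied by training_args.yaml above.
max_samples = 10000
per_device_train_batch_size = 8
gradient_accumulation_steps = 8
cutoff_len = 512

effective_batch = per_device_train_batch_size * gradient_accumulation_steps  # 64
steps_per_epoch = max_samples // effective_batch                             # 156 optimizer steps, as logged
tokens_per_step = effective_batch * cutoff_len                               # 32768 = num_input_tokens_seen / 156 (padding counted)
print(effective_batch, steps_per_epoch, tokens_per_step)
```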
training_loss.png ADDED