sengi committed on
Commit
b3a9d4f
·
verified ·
1 Parent(s): dc8737c

Model save

Browse files
README.md CHANGED
@@ -2,16 +2,12 @@
2
  license: apache-2.0
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
- - trl
7
- - sft
8
- - generated_from_trainer
9
  - trl
10
  - sft
11
  - alignment-handbook
12
  - generated_from_trainer
13
  datasets:
14
- - HuggingFaceH4/ultrachat_200k
15
  base_model: mistralai/Mistral-7B-v0.1
16
  model-index:
17
  - name: zephyr-7b-pl-qlora
@@ -23,7 +19,7 @@ should probably proofread and complete it, then remove this comment. -->
23
 
24
  # zephyr-7b-pl-qlora
25
 
26
- This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the HuggingFaceH4/ultrachat_200k dataset.
27
 
28
  ## Model description
29
 
@@ -42,11 +38,13 @@ More information needed
42
  ### Training hyperparameters
43
 
44
  The following hyperparameters were used during training:
45
- - learning_rate: 0.0002
46
  - train_batch_size: 2
47
  - eval_batch_size: 4
48
  - seed: 42
49
  - distributed_type: multi-GPU
 
 
50
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
51
  - lr_scheduler_type: cosine
52
  - lr_scheduler_warmup_ratio: 0.1
 
2
  license: apache-2.0
3
  library_name: peft
4
  tags:
 
 
 
 
5
  - trl
6
  - sft
7
  - alignment-handbook
8
  - generated_from_trainer
9
  datasets:
10
+ - generator
11
  base_model: mistralai/Mistral-7B-v0.1
12
  model-index:
13
  - name: zephyr-7b-pl-qlora
 
19
 
20
  # zephyr-7b-pl-qlora
21
 
22
+ This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the generator dataset.
23
 
24
  ## Model description
25
 
 
38
  ### Training hyperparameters
39
 
40
  The following hyperparameters were used during training:
41
+ - learning_rate: 5e-07
42
  - train_batch_size: 2
43
  - eval_batch_size: 4
44
  - seed: 42
45
  - distributed_type: multi-GPU
46
+ - gradient_accumulation_steps: 4
47
+ - total_train_batch_size: 8
48
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
49
  - lr_scheduler_type: cosine
50
  - lr_scheduler_warmup_ratio: 0.1
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.0,
3
- "train_loss": 0.0,
4
- "train_runtime": 501.823,
5
  "train_samples": 207865,
6
- "train_samples_per_second": 0.399,
7
- "train_steps_per_second": 0.199
8
  }
 
1
  {
2
+ "epoch": 0.01,
3
+ "train_loss": 0.6359813857078552,
4
+ "train_runtime": 2120.9698,
5
  "train_samples": 207865,
6
+ "train_samples_per_second": 0.377,
7
+ "train_steps_per_second": 0.047
8
  }
lora_0/adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "down_proj",
27
- "o_proj",
28
- "k_proj",
29
  "gate_proj",
30
- "v_proj",
31
  "up_proj",
32
- "q_proj"
 
 
 
 
33
  ],
34
  "task_type": null,
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
 
 
26
  "gate_proj",
 
27
  "up_proj",
28
+ "q_proj",
29
+ "v_proj",
30
+ "o_proj",
31
+ "k_proj",
32
+ "down_proj"
33
  ],
34
  "task_type": null,
35
  "use_dora": false,
lora_0/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ad8b98a44de7f55dc76067740df02dc07c1c3684a06d731190cc46be3c07c1b0
3
  size 167832240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d95350f5041ab4e9166125ad0900a94e5edbb0d39d4fdfef11b30e664c3a0d10
3
  size 167832240
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.0,
3
- "train_loss": 0.0,
4
- "train_runtime": 501.823,
5
  "train_samples": 207865,
6
- "train_samples_per_second": 0.399,
7
- "train_steps_per_second": 0.199
8
  }
 
1
  {
2
+ "epoch": 0.01,
3
+ "train_loss": 0.6359813857078552,
4
+ "train_runtime": 2120.9698,
5
  "train_samples": 207865,
6
+ "train_samples_per_second": 0.377,
7
+ "train_steps_per_second": 0.047
8
  }
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.0014344526845781992,
5
  "eval_steps": 500,
6
  "global_step": 100,
7
  "is_hyper_param_search": false,
@@ -10,56 +10,146 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 2e-05,
14
- "loss": 0.0,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.0,
19
- "learning_rate": 0.00019396926207859084,
20
- "loss": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  "step": 20
22
  },
23
  {
24
  "epoch": 0.0,
25
- "learning_rate": 0.00015000000000000001,
26
- "loss": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  "step": 40
28
  },
29
  {
30
  "epoch": 0.0,
31
- "learning_rate": 8.263518223330697e-05,
32
- "loss": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  "step": 60
34
  },
35
  {
36
  "epoch": 0.0,
37
- "learning_rate": 2.339555568810221e-05,
38
- "loss": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  "step": 80
40
  },
41
  {
42
  "epoch": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  "learning_rate": 0.0,
44
- "loss": 0.0,
45
  "step": 100
46
  },
47
  {
48
- "epoch": 0.0,
49
  "step": 100,
50
- "total_flos": 1.768131718545408e+16,
51
- "train_loss": 0.0,
52
- "train_runtime": 501.823,
53
- "train_samples_per_second": 0.399,
54
- "train_steps_per_second": 0.199
55
  }
56
  ],
57
- "logging_steps": 20,
58
  "max_steps": 100,
59
  "num_input_tokens_seen": 0,
60
  "num_train_epochs": 1,
61
  "save_steps": 100,
62
- "total_flos": 1.768131718545408e+16,
63
  "train_batch_size": 2,
64
  "trial_name": null,
65
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.005737810738312797,
5
  "eval_steps": 500,
6
  "global_step": 100,
7
  "is_hyper_param_search": false,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 5e-08,
14
+ "loss": 0.6931,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.0,
19
+ "learning_rate": 2.5e-07,
20
+ "loss": 0.6931,
21
+ "step": 5
22
+ },
23
+ {
24
+ "epoch": 0.0,
25
+ "learning_rate": 5e-07,
26
+ "loss": 0.6921,
27
+ "step": 10
28
+ },
29
+ {
30
+ "epoch": 0.0,
31
+ "learning_rate": 4.96201938253052e-07,
32
+ "loss": 0.6886,
33
+ "step": 15
34
+ },
35
+ {
36
+ "epoch": 0.0,
37
+ "learning_rate": 4.849231551964771e-07,
38
+ "loss": 0.6827,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 0.0,
43
+ "learning_rate": 4.6650635094610966e-07,
44
+ "loss": 0.6745,
45
+ "step": 25
46
+ },
47
+ {
48
+ "epoch": 0.0,
49
+ "learning_rate": 4.415111107797445e-07,
50
+ "loss": 0.6645,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 0.0,
55
+ "learning_rate": 4.106969024216348e-07,
56
+ "loss": 0.6549,
57
+ "step": 35
58
+ },
59
+ {
60
+ "epoch": 0.0,
61
+ "learning_rate": 3.75e-07,
62
+ "loss": 0.6463,
63
  "step": 40
64
  },
65
  {
66
  "epoch": 0.0,
67
+ "learning_rate": 3.355050358314172e-07,
68
+ "loss": 0.6395,
69
+ "step": 45
70
+ },
71
+ {
72
+ "epoch": 0.0,
73
+ "learning_rate": 2.934120444167326e-07,
74
+ "loss": 0.6305,
75
+ "step": 50
76
+ },
77
+ {
78
+ "epoch": 0.0,
79
+ "learning_rate": 2.5e-07,
80
+ "loss": 0.6241,
81
+ "step": 55
82
+ },
83
+ {
84
+ "epoch": 0.0,
85
+ "learning_rate": 2.065879555832674e-07,
86
+ "loss": 0.6155,
87
  "step": 60
88
  },
89
  {
90
  "epoch": 0.0,
91
+ "learning_rate": 1.6449496416858282e-07,
92
+ "loss": 0.6147,
93
+ "step": 65
94
+ },
95
+ {
96
+ "epoch": 0.0,
97
+ "learning_rate": 1.2500000000000005e-07,
98
+ "loss": 0.6042,
99
+ "step": 70
100
+ },
101
+ {
102
+ "epoch": 0.0,
103
+ "learning_rate": 8.930309757836516e-08,
104
+ "loss": 0.6041,
105
+ "step": 75
106
+ },
107
+ {
108
+ "epoch": 0.0,
109
+ "learning_rate": 5.848888922025552e-08,
110
+ "loss": 0.5989,
111
  "step": 80
112
  },
113
  {
114
  "epoch": 0.0,
115
+ "learning_rate": 3.349364905389032e-08,
116
+ "loss": 0.5988,
117
+ "step": 85
118
+ },
119
+ {
120
+ "epoch": 0.01,
121
+ "learning_rate": 1.507684480352292e-08,
122
+ "loss": 0.5961,
123
+ "step": 90
124
+ },
125
+ {
126
+ "epoch": 0.01,
127
+ "learning_rate": 3.798061746947995e-09,
128
+ "loss": 0.5984,
129
+ "step": 95
130
+ },
131
+ {
132
+ "epoch": 0.01,
133
  "learning_rate": 0.0,
134
+ "loss": 0.5982,
135
  "step": 100
136
  },
137
  {
138
+ "epoch": 0.01,
139
  "step": 100,
140
+ "total_flos": 7.072526927868723e+16,
141
+ "train_loss": 0.6359813857078552,
142
+ "train_runtime": 2120.9698,
143
+ "train_samples_per_second": 0.377,
144
+ "train_steps_per_second": 0.047
145
  }
146
  ],
147
+ "logging_steps": 5,
148
  "max_steps": 100,
149
  "num_input_tokens_seen": 0,
150
  "num_train_epochs": 1,
151
  "save_steps": 100,
152
+ "total_flos": 7.072526927868723e+16,
153
  "train_batch_size": 2,
154
  "trial_name": null,
155
  "trial_params": null