chansung committed on
Commit
e6d5524
1 Parent(s): bb2fb6f

Model save

Browse files
Files changed (4) hide show
  1. README.md +69 -0
  2. all_results.json +9 -0
  3. train_results.json +9 -0
  4. trainer_state.json +253 -0
README.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: apache-2.0
4
+ base_model: mistralai/Mistral-7B-v0.3
5
+ tags:
6
+ - trl
7
+ - sft
8
+ - generated_from_trainer
9
+ datasets:
10
+ - generator
11
+ model-index:
12
+ - name: mistral7b-milora-summarization-11-v1
13
+ results: []
14
+ ---
15
+
16
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
+ should probably proofread and complete it, then remove this comment. -->
18
+
19
+ # mistral7b-milora-summarization-11-v1
20
+
21
+ This model is a fine-tuned version of [mistralai/Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3) on the generator dataset.
22
+ It achieves the following results on the evaluation set:
23
+ - Loss: 2.0168
24
+
25
+ ## Model description
26
+
27
+ More information needed
28
+
29
+ ## Intended uses & limitations
30
+
31
+ More information needed
32
+
33
+ ## Training and evaluation data
34
+
35
+ More information needed
36
+
37
+ ## Training procedure
38
+
39
+ ### Training hyperparameters
40
+
41
+ The following hyperparameters were used during training:
42
+ - learning_rate: 0.0002
43
+ - train_batch_size: 14
44
+ - eval_batch_size: 14
45
+ - seed: 42
46
+ - distributed_type: multi-GPU
47
+ - num_devices: 8
48
+ - gradient_accumulation_steps: 2
49
+ - total_train_batch_size: 224
50
+ - total_eval_batch_size: 112
51
+ - optimizer: AdamW (torch implementation) with betas=(0.9, 0.999) and epsilon=1e-08; no additional optimizer arguments
52
+ - lr_scheduler_type: cosine
53
+ - lr_scheduler_warmup_ratio: 0.1
54
+ - num_epochs: 1
55
+
56
+ ### Training results
57
+
58
+ | Training Loss | Epoch | Step | Validation Loss |
59
+ |:-------------:|:------:|:----:|:---------------:|
60
+ | 1.3739 | 0.9965 | 142 | 2.0168 |
61
+
62
+
63
+ ### Framework versions
64
+
65
+ - PEFT 0.13.3.dev0
66
+ - Transformers 4.46.3
67
+ - Pytorch 2.3.1+cu121
68
+ - Datasets 3.1.0
69
+ - Tokenizers 0.20.3
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9964912280701754,
3
+ "total_flos": 6.954534912540017e+17,
4
+ "train_loss": 1.4842075613183034,
5
+ "train_runtime": 638.3404,
6
+ "train_samples": 129221,
7
+ "train_samples_per_second": 49.937,
8
+ "train_steps_per_second": 0.222
9
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9964912280701754,
3
+ "total_flos": 6.954534912540017e+17,
4
+ "train_loss": 1.4842075613183034,
5
+ "train_runtime": 638.3404,
6
+ "train_samples": 129221,
7
+ "train_samples_per_second": 49.937,
8
+ "train_steps_per_second": 0.222
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9964912280701754,
5
+ "eval_steps": 500,
6
+ "global_step": 142,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.007017543859649123,
13
+ "grad_norm": 3.2127857208251953,
14
+ "learning_rate": 1.3333333333333333e-05,
15
+ "loss": 2.0873,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.03508771929824561,
20
+ "grad_norm": 2.7192869186401367,
21
+ "learning_rate": 6.666666666666667e-05,
22
+ "loss": 2.0735,
23
+ "step": 5
24
+ },
25
+ {
26
+ "epoch": 0.07017543859649122,
27
+ "grad_norm": 1.4391745328903198,
28
+ "learning_rate": 0.00013333333333333334,
29
+ "loss": 1.989,
30
+ "step": 10
31
+ },
32
+ {
33
+ "epoch": 0.10526315789473684,
34
+ "grad_norm": 1.263310432434082,
35
+ "learning_rate": 0.0002,
36
+ "loss": 1.9059,
37
+ "step": 15
38
+ },
39
+ {
40
+ "epoch": 0.14035087719298245,
41
+ "grad_norm": 1.1776515245437622,
42
+ "learning_rate": 0.00019923607874151032,
43
+ "loss": 1.7644,
44
+ "step": 20
45
+ },
46
+ {
47
+ "epoch": 0.17543859649122806,
48
+ "grad_norm": 0.7352864146232605,
49
+ "learning_rate": 0.00019695598647982468,
50
+ "loss": 1.6067,
51
+ "step": 25
52
+ },
53
+ {
54
+ "epoch": 0.21052631578947367,
55
+ "grad_norm": 0.5063140392303467,
56
+ "learning_rate": 0.00019319455943394347,
57
+ "loss": 1.5369,
58
+ "step": 30
59
+ },
60
+ {
61
+ "epoch": 0.24561403508771928,
62
+ "grad_norm": 0.4345757067203522,
63
+ "learning_rate": 0.00018800926628551886,
64
+ "loss": 1.4784,
65
+ "step": 35
66
+ },
67
+ {
68
+ "epoch": 0.2807017543859649,
69
+ "grad_norm": 0.29390987753868103,
70
+ "learning_rate": 0.00018147933014790244,
71
+ "loss": 1.4507,
72
+ "step": 40
73
+ },
74
+ {
75
+ "epoch": 0.3157894736842105,
76
+ "grad_norm": 0.30241647362709045,
77
+ "learning_rate": 0.0001737045181617364,
78
+ "loss": 1.441,
79
+ "step": 45
80
+ },
81
+ {
82
+ "epoch": 0.3508771929824561,
83
+ "grad_norm": 0.32600656151771545,
84
+ "learning_rate": 0.00016480361721016054,
85
+ "loss": 1.4313,
86
+ "step": 50
87
+ },
88
+ {
89
+ "epoch": 0.38596491228070173,
90
+ "grad_norm": 0.3142453730106354,
91
+ "learning_rate": 0.00015491261904230727,
92
+ "loss": 1.4181,
93
+ "step": 55
94
+ },
95
+ {
96
+ "epoch": 0.42105263157894735,
97
+ "grad_norm": 0.26200494170188904,
98
+ "learning_rate": 0.0001441826425335387,
99
+ "loss": 1.4184,
100
+ "step": 60
101
+ },
102
+ {
103
+ "epoch": 0.45614035087719296,
104
+ "grad_norm": 0.23935051262378693,
105
+ "learning_rate": 0.00013277762482701767,
106
+ "loss": 1.4003,
107
+ "step": 65
108
+ },
109
+ {
110
+ "epoch": 0.49122807017543857,
111
+ "grad_norm": 0.2776546776294708,
112
+ "learning_rate": 0.00012087181663233354,
113
+ "loss": 1.3959,
114
+ "step": 70
115
+ },
116
+ {
117
+ "epoch": 0.5263157894736842,
118
+ "grad_norm": 0.26008203625679016,
119
+ "learning_rate": 0.00010864711994907458,
120
+ "loss": 1.3973,
121
+ "step": 75
122
+ },
123
+ {
124
+ "epoch": 0.5614035087719298,
125
+ "grad_norm": 0.2285134345293045,
126
+ "learning_rate": 9.629030889073949e-05,
127
+ "loss": 1.3859,
128
+ "step": 80
129
+ },
130
+ {
131
+ "epoch": 0.5964912280701754,
132
+ "grad_norm": 0.2977670431137085,
133
+ "learning_rate": 8.399017607042025e-05,
134
+ "loss": 1.3798,
135
+ "step": 85
136
+ },
137
+ {
138
+ "epoch": 0.631578947368421,
139
+ "grad_norm": 0.26052919030189514,
140
+ "learning_rate": 7.193464814699073e-05,
141
+ "loss": 1.378,
142
+ "step": 90
143
+ },
144
+ {
145
+ "epoch": 0.6666666666666666,
146
+ "grad_norm": 0.2686891257762909,
147
+ "learning_rate": 6.0307914601711305e-05,
148
+ "loss": 1.3713,
149
+ "step": 95
150
+ },
151
+ {
152
+ "epoch": 0.7017543859649122,
153
+ "grad_norm": 0.24683193862438202,
154
+ "learning_rate": 4.928761361302269e-05,
155
+ "loss": 1.3818,
156
+ "step": 100
157
+ },
158
+ {
159
+ "epoch": 0.7368421052631579,
160
+ "grad_norm": 0.26356831192970276,
161
+ "learning_rate": 3.904211802492922e-05,
162
+ "loss": 1.3778,
163
+ "step": 105
164
+ },
165
+ {
166
+ "epoch": 0.7719298245614035,
167
+ "grad_norm": 0.25175535678863525,
168
+ "learning_rate": 2.9727962875101e-05,
169
+ "loss": 1.3744,
170
+ "step": 110
171
+ },
172
+ {
173
+ "epoch": 0.8070175438596491,
174
+ "grad_norm": 0.27307969331741333,
175
+ "learning_rate": 2.1487453786014512e-05,
176
+ "loss": 1.3687,
177
+ "step": 115
178
+ },
179
+ {
180
+ "epoch": 0.8421052631578947,
181
+ "grad_norm": 0.22797873616218567,
182
+ "learning_rate": 1.4446492759148411e-05,
183
+ "loss": 1.3829,
184
+ "step": 120
185
+ },
186
+ {
187
+ "epoch": 0.8771929824561403,
188
+ "grad_norm": 0.24017217755317688,
189
+ "learning_rate": 8.712654590675085e-06,
190
+ "loss": 1.3692,
191
+ "step": 125
192
+ },
193
+ {
194
+ "epoch": 0.9122807017543859,
195
+ "grad_norm": 0.22941164672374725,
196
+ "learning_rate": 4.37354329798726e-06,
197
+ "loss": 1.3739,
198
+ "step": 130
199
+ },
200
+ {
201
+ "epoch": 0.9473684210526315,
202
+ "grad_norm": 0.21575631201267242,
203
+ "learning_rate": 1.4954536682736719e-06,
204
+ "loss": 1.3737,
205
+ "step": 135
206
+ },
207
+ {
208
+ "epoch": 0.9824561403508771,
209
+ "grad_norm": 0.2508339583873749,
210
+ "learning_rate": 1.2235837857387246e-07,
211
+ "loss": 1.3739,
212
+ "step": 140
213
+ },
214
+ {
215
+ "epoch": 0.9964912280701754,
216
+ "eval_loss": 2.0167758464813232,
217
+ "eval_runtime": 0.7,
218
+ "eval_samples_per_second": 34.285,
219
+ "eval_steps_per_second": 1.429,
220
+ "step": 142
221
+ },
222
+ {
223
+ "epoch": 0.9964912280701754,
224
+ "step": 142,
225
+ "total_flos": 6.954534912540017e+17,
226
+ "train_loss": 1.4842075613183034,
227
+ "train_runtime": 638.3404,
228
+ "train_samples_per_second": 49.937,
229
+ "train_steps_per_second": 0.222
230
+ }
231
+ ],
232
+ "logging_steps": 5,
233
+ "max_steps": 142,
234
+ "num_input_tokens_seen": 0,
235
+ "num_train_epochs": 1,
236
+ "save_steps": 100,
237
+ "stateful_callbacks": {
238
+ "TrainerControl": {
239
+ "args": {
240
+ "should_epoch_stop": false,
241
+ "should_evaluate": false,
242
+ "should_log": false,
243
+ "should_save": true,
244
+ "should_training_stop": true
245
+ },
246
+ "attributes": {}
247
+ }
248
+ },
249
+ "total_flos": 6.954534912540017e+17,
250
+ "train_batch_size": 14,
251
+ "trial_name": null,
252
+ "trial_params": null
253
+ }