totally-not-an-llm committed
Commit: 5207847
Parent: 6103331

Upload folder using huggingface_hub

README.md CHANGED
@@ -1,3 +1,21 @@
 ---
-license: llama2
+library_name: peft
 ---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- quant_method: bitsandbytes
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.5.0.dev0
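The README's bullet list maps one-to-one onto `transformers`' `BitsAndBytesConfig`. For reference, a minimal sketch of loading the base model with these settings (the training script itself is not part of this commit, so this is illustrative rather than the author's code):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit NF4 quantization with nested (double) quantization and bfloat16
# compute, as listed above; the llm_int8_* values in the README are the
# library defaults and are shown here only implicitly.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

base_model = AutoModelForCausalLM.from_pretrained(
    "NousResearch/Llama-2-13b-hf",  # base model named in adapter_config.json below
    quantization_config=bnb_config,
    device_map="auto",
)
```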
adapter_config.json ADDED
@@ -0,0 +1,26 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": "NousResearch/Llama-2-13b-hf",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "q_proj",
+    "down_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM"
+}
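This config applies rank-32 LoRA to all seven Llama-2 attention and MLP projections. `peft` reads the file automatically when the adapter is attached; a minimal sketch (the path is a placeholder, not the actual repo id):

```python
from peft import PeftModel

# base_model: the 4-bit quantized model from the previous sketch.
# The path below is a placeholder; point it at this repository or a local clone.
model = PeftModel.from_pretrained(base_model, "path/to/this-adapter")
model.eval()  # matches "inference_mode": true in the config above
```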
adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b1d65bac0708cdfdd5a772caba6f0eb3e1f531ad8f6275eaa74ef0db869f070
+size 500897101
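`adapter_model.bin` is tracked with Git LFS, so the diff records only the three-line pointer file: spec version, the SHA-256 of the real payload, and its size in bytes (here about 500 MB). One way to fetch the actual weights is via `huggingface_hub` (the repo id below is a placeholder):

```python
from huggingface_hub import hf_hub_download

# Downloads the real 500897101-byte file that the LFS pointer stands for.
local_path = hf_hub_download(
    repo_id="user/this-adapter-repo",  # placeholder: substitute the actual repo id
    filename="adapter_model.bin",
)
print(local_path)
```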
checkpoint-108/README.md ADDED
@@ -0,0 +1,21 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- quant_method: bitsandbytes
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.5.0.dev0
checkpoint-108/adapter_config.json ADDED
@@ -0,0 +1,26 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": "NousResearch/Llama-2-13b-hf",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "q_proj",
+    "down_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM"
+}
checkpoint-108/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fbbf430d08b1049c4dd9eb144f5bd690ab8d598cf8e525c0ea412cc0a96e1fa4
+size 500897101
checkpoint-108/adapter_model/README.md ADDED
@@ -0,0 +1,21 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- quant_method: bitsandbytes
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.5.0.dev0
checkpoint-108/adapter_model/adapter_config.json ADDED
@@ -0,0 +1,26 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": "NousResearch/Llama-2-13b-hf",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "q_proj",
+    "down_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM"
+}
checkpoint-108/adapter_model/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fbbf430d08b1049c4dd9eb144f5bd690ab8d598cf8e525c0ea412cc0a96e1fa4
+size 500897101
checkpoint-108/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:20a47089af399b903c96f12857b3af646757c59ae533849d218ea807ecb3c8e2
+size 1001736445
checkpoint-108/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56bc8d7b2ee16abc80da72ef01d4d6d1aa36022acbff2760916d933468ad1002
+size 14575
checkpoint-108/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bcf01ce42716da0dfada32a20d14411aad2fad0b89b203137a6ae2c9954d075
+size 627
checkpoint-108/trainer_state.json ADDED
@@ -0,0 +1,132 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 4.0,
+  "eval_steps": 20,
+  "global_step": 108,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {"epoch": 0.04, "learning_rate": 2e-05, "loss": 0.9768, "step": 1},
+    {"epoch": 0.07, "learning_rate": 4e-05, "loss": 1.0553, "step": 2},
+    {"epoch": 0.11, "learning_rate": 6e-05, "loss": 0.9074, "step": 3},
+    {"epoch": 0.15, "learning_rate": 8e-05, "loss": 1.0351, "step": 4},
+    {"epoch": 0.19, "learning_rate": 0.0001, "loss": 0.9918, "step": 5},
+    {"epoch": 0.22, "learning_rate": 0.00012, "loss": 0.9872, "step": 6},
+    {"epoch": 0.26, "learning_rate": 0.00014, "loss": 0.9573, "step": 7},
+    {"epoch": 0.3, "learning_rate": 0.00016, "loss": 1.0466, "step": 8},
+    {"epoch": 0.33, "learning_rate": 0.00018, "loss": 0.8995, "step": 9},
+    {"epoch": 0.37, "learning_rate": 0.0002, "loss": 0.9041, "step": 10},
+    {"epoch": 0.41, "learning_rate": 0.00019996841892833, "loss": 0.936, "step": 11},
+    {"epoch": 0.44, "learning_rate": 0.00019987369566060176, "loss": 0.8254, "step": 12},
+    {"epoch": 0.48, "learning_rate": 0.0001997158900260614, "loss": 0.9508, "step": 13},
+    {"epoch": 0.52, "learning_rate": 0.00019949510169813003, "loss": 0.929, "step": 14},
+    {"epoch": 0.56, "learning_rate": 0.0001992114701314478, "loss": 0.9618, "step": 15},
+    {"epoch": 0.59, "learning_rate": 0.0001988651744737914, "loss": 0.9317, "step": 16},
+    {"epoch": 0.63, "learning_rate": 0.00019845643345292054, "loss": 0.9399, "step": 17},
+    {"epoch": 0.67, "learning_rate": 0.0001979855052384247, "loss": 0.9377, "step": 18},
+    {"epoch": 0.7, "learning_rate": 0.00019745268727865774, "loss": 0.9048, "step": 19},
+    {"epoch": 0.74, "learning_rate": 0.0001968583161128631, "loss": 0.9311, "step": 20},
+    {"epoch": 0.74, "eval_loss": 0.8045752644538879, "eval_runtime": 2.684, "eval_samples_per_second": 1.118, "eval_steps_per_second": 0.745, "step": 20},
+    {"epoch": 0.78, "learning_rate": 0.0001962027671586086, "loss": 0.9376, "step": 21},
+    {"epoch": 0.81, "learning_rate": 0.00019548645447466431, "loss": 0.8598, "step": 22},
+    {"epoch": 0.85, "learning_rate": 0.00019470983049947444, "loss": 0.991, "step": 23},
+    {"epoch": 0.89, "learning_rate": 0.00019387338576538744, "loss": 0.8472, "step": 24},
+    {"epoch": 0.93, "learning_rate": 0.00019297764858882514, "loss": 0.8818, "step": 25},
+    {"epoch": 0.96, "learning_rate": 0.00019202318473658705, "loss": 0.8879, "step": 26},
+    {"epoch": 1.0, "learning_rate": 0.00019101059706849957, "loss": 0.8483, "step": 27},
+    {"epoch": 1.04, "learning_rate": 0.0001899405251566371, "loss": 0.9505, "step": 28},
+    {"epoch": 1.07, "learning_rate": 0.00018881364488135448, "loss": 0.9116, "step": 29},
+    {"epoch": 1.11, "learning_rate": 0.00018763066800438636, "loss": 0.8575, "step": 30},
+    {"epoch": 1.15, "learning_rate": 0.00018639234171928353, "loss": 0.8093, "step": 31},
+    {"epoch": 1.19, "learning_rate": 0.00018509944817946922, "loss": 0.7966, "step": 32},
+    {"epoch": 1.22, "learning_rate": 0.0001837528040042142, "loss": 0.8263, "step": 33},
+    {"epoch": 1.26, "learning_rate": 0.00018235325976284275, "loss": 0.7951, "step": 34},
+    {"epoch": 1.3, "learning_rate": 0.00018090169943749476, "loss": 0.849, "step": 35},
+    {"epoch": 1.33, "learning_rate": 0.00017939903986478355, "loss": 0.863, "step": 36},
+    {"epoch": 1.37, "learning_rate": 0.00017784623015670238, "loss": 0.8144, "step": 37},
+    {"epoch": 1.41, "learning_rate": 0.0001762442511011448, "loss": 0.8078, "step": 38},
+    {"epoch": 1.44, "learning_rate": 0.00017459411454241822, "loss": 0.7997, "step": 39},
+    {"epoch": 1.48, "learning_rate": 0.00017289686274214118, "loss": 0.9322, "step": 40},
+    {"epoch": 1.48, "eval_loss": 0.7793169617652893, "eval_runtime": 2.6811, "eval_samples_per_second": 1.119, "eval_steps_per_second": 0.746, "step": 40},
+    {"epoch": 1.52, "learning_rate": 0.00017115356772092857, "loss": 0.8279, "step": 41},
+    {"epoch": 1.56, "learning_rate": 0.0001693653305812805, "loss": 0.8759, "step": 42},
+    {"epoch": 1.59, "learning_rate": 0.00016753328081210245, "loss": 0.8748, "step": 43},
+    {"epoch": 1.63, "learning_rate": 0.00016565857557529566, "loss": 0.7638, "step": 44},
+    {"epoch": 1.67, "learning_rate": 0.000163742398974869, "loss": 0.7941, "step": 45},
+    {"epoch": 1.7, "learning_rate": 0.00016178596130903344, "loss": 0.8321, "step": 46},
+    {"epoch": 1.74, "learning_rate": 0.0001597904983057519, "loss": 0.894, "step": 47},
+    {"epoch": 1.78, "learning_rate": 0.00015775727034222675, "loss": 0.9176, "step": 48},
+    {"epoch": 1.81, "learning_rate": 0.00015568756164881882, "loss": 0.8286, "step": 49},
+    {"epoch": 1.85, "learning_rate": 0.00015358267949789966, "loss": 0.9328, "step": 50},
+    {"epoch": 1.89, "learning_rate": 0.00015144395337815064, "loss": 0.8644, "step": 51},
+    {"epoch": 1.93, "learning_rate": 0.00014927273415482915, "loss": 0.7769, "step": 52},
+    {"epoch": 1.96, "learning_rate": 0.0001470703932165333, "loss": 0.8, "step": 53},
+    {"epoch": 2.0, "learning_rate": 0.00014483832160900326, "loss": 0.7781, "step": 54},
+    {"epoch": 2.04, "learning_rate": 0.00014257792915650728, "loss": 0.7852, "step": 55},
+    {"epoch": 2.07, "learning_rate": 0.00014029064357136628, "loss": 0.7796, "step": 56},
+    {"epoch": 2.11, "learning_rate": 0.00013797790955218014, "loss": 0.8287, "step": 57},
+    {"epoch": 2.15, "learning_rate": 0.00013564118787132506, "loss": 0.6845, "step": 58},
+    {"epoch": 2.19, "learning_rate": 0.00013328195445229868, "loss": 0.7821, "step": 59},
+    {"epoch": 2.22, "learning_rate": 0.00013090169943749476, "loss": 0.708, "step": 60},
+    {"epoch": 2.22, "eval_loss": 0.7880761027336121, "eval_runtime": 2.6843, "eval_samples_per_second": 1.118, "eval_steps_per_second": 0.745, "step": 60},
+    {"epoch": 2.26, "learning_rate": 0.0001285019262469976, "loss": 0.8098, "step": 61},
+    {"epoch": 2.3, "learning_rate": 0.00012608415062898972, "loss": 0.82, "step": 62},
+    {"epoch": 2.33, "learning_rate": 0.00012364989970237248, "loss": 0.7187, "step": 63},
+    {"epoch": 2.37, "learning_rate": 0.00012120071099220549, "loss": 0.7802, "step": 64},
+    {"epoch": 2.41, "learning_rate": 0.00011873813145857249, "loss": 0.6834, "step": 65},
+    {"epoch": 2.44, "learning_rate": 0.00011626371651948838, "loss": 0.6808, "step": 66},
+    {"epoch": 2.48, "learning_rate": 0.0001137790290684638, "loss": 0.7881, "step": 67},
+    {"epoch": 2.52, "learning_rate": 0.00011128563848734816, "loss": 0.7281, "step": 68},
+    {"epoch": 2.56, "learning_rate": 0.00010878511965507434, "loss": 0.7231, "step": 69},
+    {"epoch": 2.59, "learning_rate": 0.00010627905195293135, "loss": 0.6938, "step": 70},
+    {"epoch": 2.63, "learning_rate": 0.00010376901826699348, "loss": 0.7633, "step": 71},
+    {"epoch": 2.67, "learning_rate": 0.00010125660398833528, "loss": 0.8253, "step": 72},
+    {"epoch": 2.7, "learning_rate": 9.874339601166473e-05, "loss": 0.8197, "step": 73},
+    {"epoch": 2.74, "learning_rate": 9.623098173300654e-05, "loss": 0.7403, "step": 74},
+    {"epoch": 2.78, "learning_rate": 9.372094804706867e-05, "loss": 0.8175, "step": 75},
+    {"epoch": 2.81, "learning_rate": 9.121488034492569e-05, "loss": 0.7249, "step": 76},
+    {"epoch": 2.85, "learning_rate": 8.871436151265184e-05, "loss": 0.7029, "step": 77},
+    {"epoch": 2.89, "learning_rate": 8.62209709315362e-05, "loss": 0.8081, "step": 78},
+    {"epoch": 2.93, "learning_rate": 8.373628348051165e-05, "loss": 0.7087, "step": 79},
+    {"epoch": 2.96, "learning_rate": 8.126186854142752e-05, "loss": 0.762, "step": 80},
+    {"epoch": 2.96, "eval_loss": 0.7806326746940613, "eval_runtime": 2.6841, "eval_samples_per_second": 1.118, "eval_steps_per_second": 0.745, "step": 80},
+    {"epoch": 3.0, "learning_rate": 7.879928900779456e-05, "loss": 0.6724, "step": 81},
+    {"epoch": 3.04, "learning_rate": 7.635010029762756e-05, "loss": 0.578, "step": 82},
+    {"epoch": 3.07, "learning_rate": 7.391584937101033e-05, "loss": 0.6599, "step": 83},
+    {"epoch": 3.11, "learning_rate": 7.149807375300239e-05, "loss": 0.732, "step": 84},
+    {"epoch": 3.15, "learning_rate": 6.909830056250527e-05, "loss": 0.6144, "step": 85},
+    {"epoch": 3.19, "learning_rate": 6.671804554770135e-05, "loss": 0.6812, "step": 86},
+    {"epoch": 3.22, "learning_rate": 6.435881212867493e-05, "loss": 0.6753, "step": 87},
+    {"epoch": 3.26, "learning_rate": 6.20220904478199e-05, "loss": 0.6341, "step": 88},
+    {"epoch": 3.3, "learning_rate": 5.9709356428633746e-05, "loss": 0.6752, "step": 89},
+    {"epoch": 3.33, "learning_rate": 5.7422070843492734e-05, "loss": 0.6995, "step": 90},
+    {"epoch": 3.37, "learning_rate": 5.5161678390996796e-05, "loss": 0.6411, "step": 91},
+    {"epoch": 3.41, "learning_rate": 5.292960678346675e-05, "loss": 0.6527, "step": 92},
+    {"epoch": 3.44, "learning_rate": 5.072726584517086e-05, "loss": 0.7026, "step": 93},
+    {"epoch": 3.48, "learning_rate": 4.8556046621849346e-05, "loss": 0.6603, "step": 94},
+    {"epoch": 3.52, "learning_rate": 4.6417320502100316e-05, "loss": 0.6798, "step": 95},
+    {"epoch": 3.56, "learning_rate": 4.431243835118124e-05, "loss": 0.623, "step": 96},
+    {"epoch": 3.59, "learning_rate": 4.224272965777326e-05, "loss": 0.685, "step": 97},
+    {"epoch": 3.63, "learning_rate": 4.020950169424815e-05, "loss": 0.7674, "step": 98},
+    {"epoch": 3.67, "learning_rate": 3.821403869096658e-05, "loss": 0.7068, "step": 99},
+    {"epoch": 3.7, "learning_rate": 3.6257601025131026e-05, "loss": 0.6724, "step": 100},
+    {"epoch": 3.7, "eval_loss": 0.811485767364502, "eval_runtime": 2.6837, "eval_samples_per_second": 1.118, "eval_steps_per_second": 0.745, "step": 100},
+    {"epoch": 3.74, "learning_rate": 3.4341424424704375e-05, "loss": 0.7169, "step": 101},
+    {"epoch": 3.78, "learning_rate": 3.246671918789755e-05, "loss": 0.6499, "step": 102},
+    {"epoch": 3.81, "learning_rate": 3.063466941871952e-05, "loss": 0.7342, "step": 103},
+    {"epoch": 3.85, "learning_rate": 2.8846432279071467e-05, "loss": 0.6587, "step": 104},
+    {"epoch": 3.89, "learning_rate": 2.7103137257858868e-05, "loss": 0.6042, "step": 105},
+    {"epoch": 3.93, "learning_rate": 2.540588545758179e-05, "loss": 0.6507, "step": 106},
+    {"epoch": 3.96, "learning_rate": 2.37557488988552e-05, "loss": 0.6646, "step": 107},
+    {"epoch": 4.0, "learning_rate": 2.2153769843297667e-05, "loss": 0.6783, "step": 108}
+  ],
+  "logging_steps": 1,
+  "max_steps": 135,
+  "num_train_epochs": 5,
+  "save_steps": 500,
+  "total_flos": 1.310693833285632e+17,
+  "trial_name": null,
+  "trial_params": null
+}
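`trainer_state.json` is plain JSON, so the loss curve it records is easy to inspect. A minimal sketch that splits `log_history` into train and eval series (key names exactly as in the file above):

```python
import json

with open("checkpoint-108/trainer_state.json") as f:
    state = json.load(f)

# Entries with "loss" are per-step training logs (logging_steps = 1);
# entries with "eval_loss" are the periodic evaluations (eval_steps = 20).
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(state["global_step"], state["epoch"])  # 108 4.0
print(evals)  # eval loss bottoms out at step 40 (~0.779) and rises by step 100
```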
checkpoint-108/training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3fa72f98f53ee315429aa7f3c33aaffdc7fad603ae51a2ace24ed2a24a41a0e
+size 4027
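Together with `optimizer.pt`, `scheduler.pt`, and `rng_state.pth`, this `training_args.bin` is exactly what `transformers.Trainer` needs to resume an interrupted run. A sketch under the assumption that `trainer` is rebuilt with the same model, data, and arguments as the original run (none of which are in this commit):

```python
from transformers import Trainer

def resume_run(trainer: Trainer) -> None:
    # Restores the optimizer, LR scheduler, RNG state, and step counter, then
    # continues from step 108 toward max_steps = 135 (see trainer_state.json).
    trainer.train(resume_from_checkpoint="checkpoint-108")
```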
checkpoint-135/README.md ADDED
@@ -0,0 +1,21 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- quant_method: bitsandbytes
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.5.0.dev0
checkpoint-135/adapter_config.json ADDED
@@ -0,0 +1,26 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": "NousResearch/Llama-2-13b-hf",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "q_proj",
+    "down_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM"
+}
checkpoint-135/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b1d65bac0708cdfdd5a772caba6f0eb3e1f531ad8f6275eaa74ef0db869f070
+size 500897101
checkpoint-135/adapter_model/README.md ADDED
@@ -0,0 +1,21 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- quant_method: bitsandbytes
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.5.0.dev0
checkpoint-135/adapter_model/adapter_config.json ADDED
@@ -0,0 +1,26 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": "NousResearch/Llama-2-13b-hf",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "q_proj",
+    "down_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM"
+}
checkpoint-135/adapter_model/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b1d65bac0708cdfdd5a772caba6f0eb3e1f531ad8f6275eaa74ef0db869f070
+size 500897101
checkpoint-135/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f3dc2eba524b4d7f69077932913a83b4aa78f7b1388688e85c4a7b032fe8b8b
+size 1001736445
checkpoint-135/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:35012476ddbb5f61baee6bd0d134b1deea5eb77266b2cfc6b2133b67e83a13eb
+size 14575
checkpoint-135/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97c36aecfbf7504ea37f819fda29d815f122316316f280698a1fe25171e59f9f
+size 627
checkpoint-135/trainer_state.json ADDED
@@ -0,0 +1,160 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 5.0,
+  "eval_steps": 20,
+  "global_step": 135,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {"epoch": 0.04, "learning_rate": 2e-05, "loss": 0.9768, "step": 1},
+    {"epoch": 0.07, "learning_rate": 4e-05, "loss": 1.0553, "step": 2},
+    {"epoch": 0.11, "learning_rate": 6e-05, "loss": 0.9074, "step": 3},
+    {"epoch": 0.15, "learning_rate": 8e-05, "loss": 1.0351, "step": 4},
+    {"epoch": 0.19, "learning_rate": 0.0001, "loss": 0.9918, "step": 5},
+    {"epoch": 0.22, "learning_rate": 0.00012, "loss": 0.9872, "step": 6},
+    {"epoch": 0.26, "learning_rate": 0.00014, "loss": 0.9573, "step": 7},
+    {"epoch": 0.3, "learning_rate": 0.00016, "loss": 1.0466, "step": 8},
+    {"epoch": 0.33, "learning_rate": 0.00018, "loss": 0.8995, "step": 9},
+    {"epoch": 0.37, "learning_rate": 0.0002, "loss": 0.9041, "step": 10},
+    {"epoch": 0.41, "learning_rate": 0.00019996841892833, "loss": 0.936, "step": 11},
+    {"epoch": 0.44, "learning_rate": 0.00019987369566060176, "loss": 0.8254, "step": 12},
+    {"epoch": 0.48, "learning_rate": 0.0001997158900260614, "loss": 0.9508, "step": 13},
+    {"epoch": 0.52, "learning_rate": 0.00019949510169813003, "loss": 0.929, "step": 14},
+    {"epoch": 0.56, "learning_rate": 0.0001992114701314478, "loss": 0.9618, "step": 15},
+    {"epoch": 0.59, "learning_rate": 0.0001988651744737914, "loss": 0.9317, "step": 16},
+    {"epoch": 0.63, "learning_rate": 0.00019845643345292054, "loss": 0.9399, "step": 17},
+    {"epoch": 0.67, "learning_rate": 0.0001979855052384247, "loss": 0.9377, "step": 18},
+    {"epoch": 0.7, "learning_rate": 0.00019745268727865774, "loss": 0.9048, "step": 19},
+    {"epoch": 0.74, "learning_rate": 0.0001968583161128631, "loss": 0.9311, "step": 20},
+    {"epoch": 0.74, "eval_loss": 0.8045752644538879, "eval_runtime": 2.684, "eval_samples_per_second": 1.118, "eval_steps_per_second": 0.745, "step": 20},
+    {"epoch": 0.78, "learning_rate": 0.0001962027671586086, "loss": 0.9376, "step": 21},
+    {"epoch": 0.81, "learning_rate": 0.00019548645447466431, "loss": 0.8598, "step": 22},
+    {"epoch": 0.85, "learning_rate": 0.00019470983049947444, "loss": 0.991, "step": 23},
+    {"epoch": 0.89, "learning_rate": 0.00019387338576538744, "loss": 0.8472, "step": 24},
+    {"epoch": 0.93, "learning_rate": 0.00019297764858882514, "loss": 0.8818, "step": 25},
+    {"epoch": 0.96, "learning_rate": 0.00019202318473658705, "loss": 0.8879, "step": 26},
+    {"epoch": 1.0, "learning_rate": 0.00019101059706849957, "loss": 0.8483, "step": 27},
+    {"epoch": 1.04, "learning_rate": 0.0001899405251566371, "loss": 0.9505, "step": 28},
+    {"epoch": 1.07, "learning_rate": 0.00018881364488135448, "loss": 0.9116, "step": 29},
+    {"epoch": 1.11, "learning_rate": 0.00018763066800438636, "loss": 0.8575, "step": 30},
+    {"epoch": 1.15, "learning_rate": 0.00018639234171928353, "loss": 0.8093, "step": 31},
+    {"epoch": 1.19, "learning_rate": 0.00018509944817946922, "loss": 0.7966, "step": 32},
+    {"epoch": 1.22, "learning_rate": 0.0001837528040042142, "loss": 0.8263, "step": 33},
+    {"epoch": 1.26, "learning_rate": 0.00018235325976284275, "loss": 0.7951, "step": 34},
+    {"epoch": 1.3, "learning_rate": 0.00018090169943749476, "loss": 0.849, "step": 35},
+    {"epoch": 1.33, "learning_rate": 0.00017939903986478355, "loss": 0.863, "step": 36},
+    {"epoch": 1.37, "learning_rate": 0.00017784623015670238, "loss": 0.8144, "step": 37},
+    {"epoch": 1.41, "learning_rate": 0.0001762442511011448, "loss": 0.8078, "step": 38},
+    {"epoch": 1.44, "learning_rate": 0.00017459411454241822, "loss": 0.7997, "step": 39},
+    {"epoch": 1.48, "learning_rate": 0.00017289686274214118, "loss": 0.9322, "step": 40},
+    {"epoch": 1.48, "eval_loss": 0.7793169617652893, "eval_runtime": 2.6811, "eval_samples_per_second": 1.119, "eval_steps_per_second": 0.746, "step": 40},
+    {"epoch": 1.52, "learning_rate": 0.00017115356772092857, "loss": 0.8279, "step": 41},
+    {"epoch": 1.56, "learning_rate": 0.0001693653305812805, "loss": 0.8759, "step": 42},
+    {"epoch": 1.59, "learning_rate": 0.00016753328081210245, "loss": 0.8748, "step": 43},
+    {"epoch": 1.63, "learning_rate": 0.00016565857557529566, "loss": 0.7638, "step": 44},
+    {"epoch": 1.67, "learning_rate": 0.000163742398974869, "loss": 0.7941, "step": 45},
+    {"epoch": 1.7, "learning_rate": 0.00016178596130903344, "loss": 0.8321, "step": 46},
+    {"epoch": 1.74, "learning_rate": 0.0001597904983057519, "loss": 0.894, "step": 47},
+    {"epoch": 1.78, "learning_rate": 0.00015775727034222675, "loss": 0.9176, "step": 48},
+    {"epoch": 1.81, "learning_rate": 0.00015568756164881882, "loss": 0.8286, "step": 49},
+    {"epoch": 1.85, "learning_rate": 0.00015358267949789966, "loss": 0.9328, "step": 50},
+    {"epoch": 1.89, "learning_rate": 0.00015144395337815064, "loss": 0.8644, "step": 51},
+    {"epoch": 1.93, "learning_rate": 0.00014927273415482915, "loss": 0.7769, "step": 52},
+    {"epoch": 1.96, "learning_rate": 0.0001470703932165333, "loss": 0.8, "step": 53},
+    {"epoch": 2.0, "learning_rate": 0.00014483832160900326, "loss": 0.7781, "step": 54},
+    {"epoch": 2.04, "learning_rate": 0.00014257792915650728, "loss": 0.7852, "step": 55},
+    {"epoch": 2.07, "learning_rate": 0.00014029064357136628, "loss": 0.7796, "step": 56},
+    {"epoch": 2.11, "learning_rate": 0.00013797790955218014, "loss": 0.8287, "step": 57},
+    {"epoch": 2.15, "learning_rate": 0.00013564118787132506, "loss": 0.6845, "step": 58},
+    {"epoch": 2.19, "learning_rate": 0.00013328195445229868, "loss": 0.7821, "step": 59},
+    {"epoch": 2.22, "learning_rate": 0.00013090169943749476, "loss": 0.708, "step": 60},
+    {"epoch": 2.22, "eval_loss": 0.7880761027336121, "eval_runtime": 2.6843, "eval_samples_per_second": 1.118, "eval_steps_per_second": 0.745, "step": 60},
+    {"epoch": 2.26, "learning_rate": 0.0001285019262469976, "loss": 0.8098, "step": 61},
+    {"epoch": 2.3, "learning_rate": 0.00012608415062898972, "loss": 0.82, "step": 62},
+    {"epoch": 2.33, "learning_rate": 0.00012364989970237248, "loss": 0.7187, "step": 63},
+    {"epoch": 2.37, "learning_rate": 0.00012120071099220549, "loss": 0.7802, "step": 64},
+    {"epoch": 2.41, "learning_rate": 0.00011873813145857249, "loss": 0.6834, "step": 65},
+    {"epoch": 2.44, "learning_rate": 0.00011626371651948838, "loss": 0.6808, "step": 66},
+    {"epoch": 2.48, "learning_rate": 0.0001137790290684638, "loss": 0.7881, "step": 67},
+    {"epoch": 2.52, "learning_rate": 0.00011128563848734816, "loss": 0.7281, "step": 68},
+    {"epoch": 2.56, "learning_rate": 0.00010878511965507434, "loss": 0.7231, "step": 69},
+    {"epoch": 2.59, "learning_rate": 0.00010627905195293135, "loss": 0.6938, "step": 70},
+    {"epoch": 2.63, "learning_rate": 0.00010376901826699348, "loss": 0.7633, "step": 71},
+    {"epoch": 2.67, "learning_rate": 0.00010125660398833528, "loss": 0.8253, "step": 72},
+    {"epoch": 2.7, "learning_rate": 9.874339601166473e-05, "loss": 0.8197, "step": 73},
+    {"epoch": 2.74, "learning_rate": 9.623098173300654e-05, "loss": 0.7403, "step": 74},
+    {"epoch": 2.78, "learning_rate": 9.372094804706867e-05, "loss": 0.8175, "step": 75},
+    {"epoch": 2.81, "learning_rate": 9.121488034492569e-05, "loss": 0.7249, "step": 76},
+    {"epoch": 2.85, "learning_rate": 8.871436151265184e-05, "loss": 0.7029, "step": 77},
+    {"epoch": 2.89, "learning_rate": 8.62209709315362e-05, "loss": 0.8081, "step": 78},
+    {"epoch": 2.93, "learning_rate": 8.373628348051165e-05, "loss": 0.7087, "step": 79},
+    {"epoch": 2.96, "learning_rate": 8.126186854142752e-05, "loss": 0.762, "step": 80},
+    {"epoch": 2.96, "eval_loss": 0.7806326746940613, "eval_runtime": 2.6841, "eval_samples_per_second": 1.118, "eval_steps_per_second": 0.745, "step": 80},
+    {"epoch": 3.0, "learning_rate": 7.879928900779456e-05, "loss": 0.6724, "step": 81},
+    {"epoch": 3.04, "learning_rate": 7.635010029762756e-05, "loss": 0.578, "step": 82},
+    {"epoch": 3.07, "learning_rate": 7.391584937101033e-05, "loss": 0.6599, "step": 83},
+    {"epoch": 3.11, "learning_rate": 7.149807375300239e-05, "loss": 0.732, "step": 84},
+    {"epoch": 3.15, "learning_rate": 6.909830056250527e-05, "loss": 0.6144, "step": 85},
+    {"epoch": 3.19, "learning_rate": 6.671804554770135e-05, "loss": 0.6812, "step": 86},
+    {"epoch": 3.22, "learning_rate": 6.435881212867493e-05, "loss": 0.6753, "step": 87},
+    {"epoch": 3.26, "learning_rate": 6.20220904478199e-05, "loss": 0.6341, "step": 88},
+    {"epoch": 3.3, "learning_rate": 5.9709356428633746e-05, "loss": 0.6752, "step": 89},
+    {"epoch": 3.33, "learning_rate": 5.7422070843492734e-05, "loss": 0.6995, "step": 90},
+    {"epoch": 3.37, "learning_rate": 5.5161678390996796e-05, "loss": 0.6411, "step": 91},
+    {"epoch": 3.41, "learning_rate": 5.292960678346675e-05, "loss": 0.6527, "step": 92},
+    {"epoch": 3.44, "learning_rate": 5.072726584517086e-05, "loss": 0.7026, "step": 93},
+    {"epoch": 3.48, "learning_rate": 4.8556046621849346e-05, "loss": 0.6603, "step": 94},
+    {"epoch": 3.52, "learning_rate": 4.6417320502100316e-05, "loss": 0.6798, "step": 95},
+    {"epoch": 3.56, "learning_rate": 4.431243835118124e-05, "loss": 0.623, "step": 96},
+    {"epoch": 3.59, "learning_rate": 4.224272965777326e-05, "loss": 0.685, "step": 97},
+    {"epoch": 3.63, "learning_rate": 4.020950169424815e-05, "loss": 0.7674, "step": 98},
+    {"epoch": 3.67, "learning_rate": 3.821403869096658e-05, "loss": 0.7068, "step": 99},
+    {"epoch": 3.7, "learning_rate": 3.6257601025131026e-05, "loss": 0.6724, "step": 100},
+    {"epoch": 3.7, "eval_loss": 0.811485767364502, "eval_runtime": 2.6837, "eval_samples_per_second": 1.118, "eval_steps_per_second": 0.745, "step": 100},
+    {"epoch": 3.74, "learning_rate": 3.4341424424704375e-05, "loss": 0.7169, "step": 101},
+    {"epoch": 3.78, "learning_rate": 3.246671918789755e-05, "loss": 0.6499, "step": 102},
+    {"epoch": 3.81, "learning_rate": 3.063466941871952e-05, "loss": 0.7342, "step": 103},
+    {"epoch": 3.85, "learning_rate": 2.8846432279071467e-05, "loss": 0.6587, "step": 104},
+    {"epoch": 3.89, "learning_rate": 2.7103137257858868e-05, "loss": 0.6042, "step": 105},
+    {"epoch": 3.93, "learning_rate": 2.540588545758179e-05, "loss": 0.6507, "step": 106},
+    {"epoch": 3.96, "learning_rate": 2.37557488988552e-05, "loss": 0.6646, "step": 107},
+    {"epoch": 4.0, "learning_rate": 2.2153769843297667e-05, "loss": 0.6783, "step": 108},
+    {"epoch": 4.04, "learning_rate": 2.0600960135216462e-05, "loss": 0.6036, "step": 109},
+    {"epoch": 4.07, "learning_rate": 1.9098300562505266e-05, "loss": 0.6631, "step": 110},
+    {"epoch": 4.11, "learning_rate": 1.7646740237157256e-05, "loss": 0.5743, "step": 111},
+    {"epoch": 4.15, "learning_rate": 1.6247195995785837e-05, "loss": 0.6115, "step": 112},
+    {"epoch": 4.19, "learning_rate": 1.4900551820530828e-05, "loss": 0.6387, "step": 113},
+    {"epoch": 4.22, "learning_rate": 1.3607658280716473e-05, "loss": 0.6785, "step": 114},
+    {"epoch": 4.26, "learning_rate": 1.2369331995613665e-05, "loss": 0.6049, "step": 115},
+    {"epoch": 4.3, "learning_rate": 1.1186355118645554e-05, "loss": 0.5745, "step": 116},
+    {"epoch": 4.33, "learning_rate": 1.0059474843362892e-05, "loss": 0.5218, "step": 117},
+    {"epoch": 4.37, "learning_rate": 8.989402931500434e-06, "loss": 0.5492, "step": 118},
+    {"epoch": 4.41, "learning_rate": 7.976815263412963e-06, "loss": 0.6314, "step": 119},
+    {"epoch": 4.44, "learning_rate": 7.022351411174866e-06, "loss": 0.6404, "step": 120},
+    {"epoch": 4.44, "eval_loss": 0.830319344997406, "eval_runtime": 2.6841, "eval_samples_per_second": 1.118, "eval_steps_per_second": 0.745, "step": 120},
+    {"epoch": 4.48, "learning_rate": 6.126614234612593e-06, "loss": 0.6818, "step": 121},
+    {"epoch": 4.52, "learning_rate": 5.290169500525577e-06, "loss": 0.6437, "step": 122},
+    {"epoch": 4.56, "learning_rate": 4.513545525335705e-06, "loss": 0.6208, "step": 123},
+    {"epoch": 4.59, "learning_rate": 3.797232841391407e-06, "loss": 0.5599, "step": 124},
+    {"epoch": 4.63, "learning_rate": 3.1416838871368924e-06, "loss": 0.6858, "step": 125},
+    {"epoch": 4.67, "learning_rate": 2.5473127213422763e-06, "loss": 0.7139, "step": 126},
+    {"epoch": 4.7, "learning_rate": 2.014494761575314e-06, "loss": 0.6138, "step": 127},
+    {"epoch": 4.74, "learning_rate": 1.543566547079467e-06, "loss": 0.631, "step": 128},
+    {"epoch": 4.78, "learning_rate": 1.134825526208605e-06, "loss": 0.5295, "step": 129},
+    {"epoch": 4.81, "learning_rate": 7.885298685522235e-07, "loss": 0.6071, "step": 130},
+    {"epoch": 4.85, "learning_rate": 5.048983018699827e-07, "loss": 0.665, "step": 131},
+    {"epoch": 4.89, "learning_rate": 2.841099739386066e-07, "loss": 0.6381, "step": 132},
+    {"epoch": 4.93, "learning_rate": 1.2630433939825327e-07, "loss": 0.6353, "step": 133},
+    {"epoch": 4.96, "learning_rate": 3.1581071670006015e-08, "loss": 0.7291, "step": 134},
+    {"epoch": 5.0, "learning_rate": 0.0, "loss": 0.6235, "step": 135}
+  ],
+  "logging_steps": 1,
+  "max_steps": 135,
+  "num_train_epochs": 5,
+  "save_steps": 500,
+  "total_flos": 1.638301223755776e+17,
+  "trial_name": null,
+  "trial_params": null
+}
checkpoint-135/training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3fa72f98f53ee315429aa7f3c33aaffdc7fad603ae51a2ace24ed2a24a41a0e
+size 4027
checkpoint-81/README.md ADDED
@@ -0,0 +1,21 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- quant_method: bitsandbytes
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.5.0.dev0
checkpoint-81/adapter_config.json ADDED
@@ -0,0 +1,26 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": "NousResearch/Llama-2-13b-hf",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "q_proj",
+    "down_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM"
+}
checkpoint-81/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9006af1558669882c79f3b51000264efe56fc81ed4d8d124e1a98a546d77dc4b
+size 500897101
checkpoint-81/adapter_model/README.md ADDED
@@ -0,0 +1,21 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- quant_method: bitsandbytes
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.5.0.dev0
checkpoint-81/adapter_model/adapter_config.json ADDED
@@ -0,0 +1,26 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": "NousResearch/Llama-2-13b-hf",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "q_proj",
+    "down_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM"
+}
checkpoint-81/adapter_model/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9006af1558669882c79f3b51000264efe56fc81ed4d8d124e1a98a546d77dc4b
+size 500897101
checkpoint-81/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5347cf654d9e8d3f5e151cfc120cc94a8e20a6b6dd140d9960adf23fc84b1f12
+size 1001736445
checkpoint-81/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9ee6701aa7c579a9c04a715a3a84fc255d21cd8d1bf96ee29b80699ddfbab21
+size 14575
checkpoint-81/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f759a02ac2ef9d7ceb83d271d45437e99d9c2488e28aeabfb053ed036de56c8
+size 627
checkpoint-81/trainer_state.json ADDED
@@ -0,0 +1,104 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 20,
+  "global_step": 81,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {"epoch": 0.04, "learning_rate": 2e-05, "loss": 0.9768, "step": 1},
+    {"epoch": 0.07, "learning_rate": 4e-05, "loss": 1.0553, "step": 2},
+    {"epoch": 0.11, "learning_rate": 6e-05, "loss": 0.9074, "step": 3},
+    {"epoch": 0.15, "learning_rate": 8e-05, "loss": 1.0351, "step": 4},
+    {"epoch": 0.19, "learning_rate": 0.0001, "loss": 0.9918, "step": 5},
+    {"epoch": 0.22, "learning_rate": 0.00012, "loss": 0.9872, "step": 6},
+    {"epoch": 0.26, "learning_rate": 0.00014, "loss": 0.9573, "step": 7},
+    {"epoch": 0.3, "learning_rate": 0.00016, "loss": 1.0466, "step": 8},
+    {"epoch": 0.33, "learning_rate": 0.00018, "loss": 0.8995, "step": 9},
+    {"epoch": 0.37, "learning_rate": 0.0002, "loss": 0.9041, "step": 10},
+    {"epoch": 0.41, "learning_rate": 0.00019996841892833, "loss": 0.936, "step": 11},
+    {"epoch": 0.44, "learning_rate": 0.00019987369566060176, "loss": 0.8254, "step": 12},
+    {"epoch": 0.48, "learning_rate": 0.0001997158900260614, "loss": 0.9508, "step": 13},
+    {"epoch": 0.52, "learning_rate": 0.00019949510169813003, "loss": 0.929, "step": 14},
+    {"epoch": 0.56, "learning_rate": 0.0001992114701314478, "loss": 0.9618, "step": 15},
+    {"epoch": 0.59, "learning_rate": 0.0001988651744737914, "loss": 0.9317, "step": 16},
+    {"epoch": 0.63, "learning_rate": 0.00019845643345292054, "loss": 0.9399, "step": 17},
+    {"epoch": 0.67, "learning_rate": 0.0001979855052384247, "loss": 0.9377, "step": 18},
+    {"epoch": 0.7, "learning_rate": 0.00019745268727865774, "loss": 0.9048, "step": 19},
+    {"epoch": 0.74, "learning_rate": 0.0001968583161128631, "loss": 0.9311, "step": 20},
+    {"epoch": 0.74, "eval_loss": 0.8045752644538879, "eval_runtime": 2.684, "eval_samples_per_second": 1.118, "eval_steps_per_second": 0.745, "step": 20},
+    {"epoch": 0.78, "learning_rate": 0.0001962027671586086, "loss": 0.9376, "step": 21},
+    {"epoch": 0.81, "learning_rate": 0.00019548645447466431, "loss": 0.8598, "step": 22},
+    {"epoch": 0.85, "learning_rate": 0.00019470983049947444, "loss": 0.991, "step": 23},
+    {"epoch": 0.89, "learning_rate": 0.00019387338576538744, "loss": 0.8472, "step": 24},
+    {"epoch": 0.93, "learning_rate": 0.00019297764858882514, "loss": 0.8818, "step": 25},
+    {"epoch": 0.96, "learning_rate": 0.00019202318473658705, "loss": 0.8879, "step": 26},
+    {"epoch": 1.0, "learning_rate": 0.00019101059706849957, "loss": 0.8483, "step": 27},
+    {"epoch": 1.04, "learning_rate": 0.0001899405251566371, "loss": 0.9505, "step": 28},
+    {"epoch": 1.07, "learning_rate": 0.00018881364488135448, "loss": 0.9116, "step": 29},
+    {"epoch": 1.11, "learning_rate": 0.00018763066800438636, "loss": 0.8575, "step": 30},
+    {"epoch": 1.15, "learning_rate": 0.00018639234171928353, "loss": 0.8093, "step": 31},
+    {"epoch": 1.19, "learning_rate": 0.00018509944817946922, "loss": 0.7966, "step": 32},
+    {"epoch": 1.22, "learning_rate": 0.0001837528040042142, "loss": 0.8263, "step": 33},
+    {"epoch": 1.26, "learning_rate": 0.00018235325976284275, "loss": 0.7951, "step": 34},
+    {"epoch": 1.3, "learning_rate": 0.00018090169943749476, "loss": 0.849, "step": 35},
+    {"epoch": 1.33, "learning_rate": 0.00017939903986478355, "loss": 0.863, "step": 36},
+    {"epoch": 1.37, "learning_rate": 0.00017784623015670238, "loss": 0.8144, "step": 37},
+    {"epoch": 1.41, "learning_rate": 0.0001762442511011448, "loss": 0.8078, "step": 38},
+    {"epoch": 1.44, "learning_rate": 0.00017459411454241822, "loss": 0.7997, "step": 39},
+    {"epoch": 1.48, "learning_rate": 0.00017289686274214118, "loss": 0.9322, "step": 40},
+    {"epoch": 1.48, "eval_loss": 0.7793169617652893, "eval_runtime": 2.6811, "eval_samples_per_second": 1.119, "eval_steps_per_second": 0.746, "step": 40},
+    {"epoch": 1.52, "learning_rate": 0.00017115356772092857, "loss": 0.8279, "step": 41},
+    {"epoch": 1.56, "learning_rate": 0.0001693653305812805, "loss": 0.8759, "step": 42},
+    {"epoch": 1.59, "learning_rate": 0.00016753328081210245, "loss": 0.8748, "step": 43},
+    {"epoch": 1.63, "learning_rate": 0.00016565857557529566, "loss": 0.7638, "step": 44},
+    {"epoch": 1.67, "learning_rate": 0.000163742398974869, "loss": 0.7941, "step": 45},
+    {"epoch": 1.7, "learning_rate": 0.00016178596130903344, "loss": 0.8321, "step": 46},
+    {"epoch": 1.74, "learning_rate": 0.0001597904983057519, "loss": 0.894, "step": 47},
+    {"epoch": 1.78, "learning_rate": 0.00015775727034222675, "loss": 0.9176, "step": 48},
+    {"epoch": 1.81, "learning_rate": 0.00015568756164881882, "loss": 0.8286, "step": 49},
+    {"epoch": 1.85, "learning_rate": 0.00015358267949789966, "loss": 0.9328, "step": 50},
+    {"epoch": 1.89, "learning_rate": 0.00015144395337815064, "loss": 0.8644, "step": 51},
+    {"epoch": 1.93, "learning_rate": 0.00014927273415482915, "loss": 0.7769, "step": 52},
+    {"epoch": 1.96, "learning_rate": 0.0001470703932165333, "loss": 0.8, "step": 53},
+    {"epoch": 2.0, "learning_rate": 0.00014483832160900326, "loss": 0.7781, "step": 54},
+    {"epoch": 2.04, "learning_rate": 0.00014257792915650728, "loss": 0.7852, "step": 55},
+    {"epoch": 2.07, "learning_rate": 0.00014029064357136628, "loss": 0.7796, "step": 56},
+    {"epoch": 2.11, "learning_rate": 0.00013797790955218014, "loss": 0.8287, "step": 57},
+    {"epoch": 2.15, "learning_rate": 0.00013564118787132506, "loss": 0.6845, "step": 58},
+    {"epoch": 2.19, "learning_rate": 0.00013328195445229868, "loss": 0.7821, "step": 59},
+    {"epoch": 2.22, "learning_rate": 0.00013090169943749476, "loss": 0.708, "step": 60},
+    {"epoch": 2.22, "eval_loss": 0.7880761027336121, "eval_runtime": 2.6843, "eval_samples_per_second": 1.118, "eval_steps_per_second": 0.745, "step": 60},
+    {"epoch": 2.26, "learning_rate": 0.0001285019262469976, "loss": 0.8098, "step": 61},
+    {"epoch": 2.3, "learning_rate": 0.00012608415062898972, "loss": 0.82, "step": 62},
+    {"epoch": 2.33, "learning_rate": 0.00012364989970237248, "loss": 0.7187, "step": 63},
+    {"epoch": 2.37, "learning_rate": 0.00012120071099220549, "loss": 0.7802, "step": 64},
+    {"epoch": 2.41, "learning_rate": 0.00011873813145857249, "loss": 0.6834, "step": 65},
+    {"epoch": 2.44, "learning_rate": 0.00011626371651948838, "loss": 0.6808, "step": 66},
+    {"epoch": 2.48, "learning_rate": 0.0001137790290684638, "loss": 0.7881, "step": 67},
+    {"epoch": 2.52, "learning_rate": 0.00011128563848734816, "loss": 0.7281, "step": 68},
+    {"epoch": 2.56, "learning_rate": 0.00010878511965507434, "loss": 0.7231, "step": 69},
+    {"epoch": 2.59, "learning_rate": 0.00010627905195293135, "loss": 0.6938, "step": 70},
+    {"epoch": 2.63, "learning_rate": 0.00010376901826699348, "loss": 0.7633, "step": 71},
+    {"epoch": 2.67, "learning_rate": 0.00010125660398833528, "loss": 0.8253, "step": 72},
+    {"epoch": 2.7, "learning_rate": 9.874339601166473e-05, "loss": 0.8197, "step": 73},
+    {"epoch": 2.74, "learning_rate": 9.623098173300654e-05, "loss": 0.7403, "step": 74},
+    {"epoch": 2.78, "learning_rate": 9.372094804706867e-05, "loss": 0.8175, "step": 75},
+    {"epoch": 2.81, "learning_rate": 9.121488034492569e-05, "loss": 0.7249, "step": 76},
+    {"epoch": 2.85, "learning_rate": 8.871436151265184e-05, "loss": 0.7029, "step": 77},
+    {"epoch": 2.89, "learning_rate": 8.62209709315362e-05, "loss": 0.8081, "step": 78},
+    {"epoch": 2.93, "learning_rate": 8.373628348051165e-05, "loss": 0.7087, "step": 79},
+    {"epoch": 2.96, "learning_rate": 8.126186854142752e-05, "loss": 0.762, "step": 80},
+    {"epoch": 2.96, "eval_loss": 0.7806326746940613, "eval_runtime": 2.6841, "eval_samples_per_second": 1.118, "eval_steps_per_second": 0.745, "step": 80},
+    {"epoch": 3.0, "learning_rate": 7.879928900779456e-05, "loss": 0.6724, "step": 81}
+  ],
+  "logging_steps": 1,
+  "max_steps": 135,
+  "num_train_epochs": 5,
+  "save_steps": 500,
+  "total_flos": 9.83564041740288e+16,
+  "trial_name": null,
+  "trial_params": null
+}
checkpoint-81/training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3fa72f98f53ee315429aa7f3c33aaffdc7fad603ae51a2ace24ed2a24a41a0e
+size 4027