winglian committed
Commit bd8a262 (verified)
1 Parent(s): 2b9aa5e

End of training
README.md CHANGED
@@ -50,8 +50,8 @@ sequence_len: 2048
 sample_packing: false
 pad_to_sequence_len: false

-lora_r: 128
-lora_alpha: 64
+lora_r: 256
+lora_alpha: 128
 lora_dropout: 0.05
 lora_target_linear: true
 lora_modules_to_save:
@@ -69,11 +69,11 @@ wandb_project: dpo-zephyr-deita-nectar
 wandb_entity: oaaic
 wandb_watch:
 wandb_run_id:
-wandb_name: kto-3ep-v2b
+wandb_name: kto-3ep-v3-r256
 wandb_log_model:

 gradient_accumulation_steps: 1
-micro_batch_size: 4
+micro_batch_size: 2
 num_epochs: 3
 optimizer: paged_adamw_8bit
 adam_beta2: 0.95
@@ -139,17 +139,17 @@ More information needed

 The following hyperparameters were used during training:
 - learning_rate: 1e-05
-- train_batch_size: 4
+- train_batch_size: 2
 - eval_batch_size: 8
 - seed: 42
 - distributed_type: multi-GPU
 - num_devices: 4
-- total_train_batch_size: 16
+- total_train_batch_size: 8
 - total_eval_batch_size: 32
 - optimizer: Adam with betas=(0.9,0.95) and epsilon=1e-08
 - lr_scheduler_type: linear
 - lr_scheduler_warmup_steps: 10
-- training_steps: 1615
+- training_steps: 3230

 ### Training results

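A quick sanity check on the hyperparameters above: with gradient_accumulation_steps 1 across 4 devices, halving micro_batch_size from 4 to 2 halves the effective batch from 16 to 8, and the step count doubles accordingly (1615 to 3230). A minimal sketch of the arithmetic, using only values from this diff:

```python
# Minimal sanity check of the batch-size arithmetic in the README diff above.
# All inputs below are taken from the updated config; nothing else is assumed.
micro_batch_size = 2             # was 4 before this commit
gradient_accumulation_steps = 1
num_devices = 4

total_train_batch_size = micro_batch_size * gradient_accumulation_steps * num_devices
assert total_train_batch_size == 8   # README reports 8 (was 16)

# Halving the effective batch doubles the optimizer steps over the same data:
assert 1615 * (16 // total_train_batch_size) == 3230
```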
adapter_config.json CHANGED
@@ -9,23 +9,23 @@
   "layers_pattern": null,
   "layers_to_transform": null,
   "loftq_config": {},
-  "lora_alpha": 64,
+  "lora_alpha": 128,
   "lora_dropout": 0.05,
   "megatron_config": null,
   "megatron_core": "megatron.core",
   "modules_to_save": null,
   "peft_type": "LORA",
-  "r": 128,
+  "r": 256,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "o_proj",
+    "q_proj",
     "down_proj",
-    "v_proj",
     "up_proj",
-    "k_proj",
-    "q_proj",
-    "gate_proj"
+    "gate_proj",
+    "v_proj",
+    "k_proj"
   ],
   "task_type": "CAUSAL_LM"
 }
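For reference, the updated adapter config corresponds to a PEFT LoraConfig along the lines of the sketch below. The field values are copied from the diff; the surrounding usage (imports, any base-model wiring) is an assumption, not part of this commit:

```python
from peft import LoraConfig

# Hedged sketch of a LoraConfig equivalent to the updated adapter_config.json.
# r and lora_alpha come from this commit (both doubled, keeping alpha/r = 0.5).
lora_config = LoraConfig(
    r=256,
    lora_alpha=128,
    lora_dropout=0.05,
    target_modules=[
        "o_proj", "q_proj", "down_proj", "up_proj",
        "gate_proj", "v_proj", "k_proj",
    ],
    task_type="CAUSAL_LM",
)
```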
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:016529bf26408ff55a13f1fc2cee3784838109b70227b72df6768c68a26d340b
-size 671150064
+oid sha256:e197c0323741596d10f600eb6ba058defb470f2491756a549f6930771e5deb70
+size 1342239008
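The adapter file roughly doubling, from about 671 MB to about 1.34 GB, is what linear scaling in the rank predicts: LoRA stores an r x in_features matrix A and an out_features x r matrix B per adapted projection. An illustrative check with hypothetical 4096 x 4096 projection shapes (not recorded in this commit):

```python
# Illustrative only: LoRA parameter count per adapted Linear is linear in r,
# since it stores A (r x in_features) and B (out_features x r).
def lora_params(in_features: int, out_features: int, r: int) -> int:
    return r * in_features + out_features * r

# With hypothetical 4096x4096 projections, doubling r doubles the adapter:
assert lora_params(4096, 4096, 256) == 2 * lora_params(4096, 4096, 128)
```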
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ee258947db852c631def2386ec1c820a61ce50169128a3b5071c2ea627f1eb0e
-size 4924995664
+oid sha256:3a3c6b5f84edcc9460d29155b6f11844d43a2badb26292b071ea03765c28f1af
+size 4917096408
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:72a24b39fbf1d809046f1c1b5c38e6459b2300dc01f36aeb55bf467781c524d3
-size 4993711256
+oid sha256:cdc23002d59a5df08a6244df334b1958ae78b426d43a5ebe94a0af2ad8a7b587
+size 4968544640
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dfeb5faef09ade5e6833d5afdd456eec9fa622689ccca4eef9befc5842580351
-size 4974328272
+oid sha256:0dba996654afa2bc8637f3fb3e8edc8c5ce7c804b9a607dd299a258602588aaf
+size 4946490800
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:600ae732306b6a4440d9c035ab824c3383128f9a86106a403888d758aa02ce6b
-size 262144128
+oid sha256:b71ef791b8de8c7cc7fea4c1d68df265468f25718fade06a8558a12e6967bd92
+size 994136032
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
 {
   "metadata": {
-    "total_size": 15155085312
+    "total_size": 15826173952
   },
   "weight_map": {
     "lm_head.weight": "model-00004-of-00004.safetensors",
@@ -62,18 +62,18 @@
     "model.layers.10.mlp.up_proj.lora_A.default.weight": "model-00002-of-00004.safetensors",
     "model.layers.10.mlp.up_proj.lora_B.default.weight": "model-00002-of-00004.safetensors",
     "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
-    "model.layers.10.self_attn.k_proj.base_layer.weight": "model-00001-of-00004.safetensors",
-    "model.layers.10.self_attn.k_proj.lora_A.default.weight": "model-00001-of-00004.safetensors",
-    "model.layers.10.self_attn.k_proj.lora_B.default.weight": "model-00001-of-00004.safetensors",
-    "model.layers.10.self_attn.o_proj.base_layer.weight": "model-00001-of-00004.safetensors",
-    "model.layers.10.self_attn.o_proj.lora_A.default.weight": "model-00001-of-00004.safetensors",
-    "model.layers.10.self_attn.o_proj.lora_B.default.weight": "model-00001-of-00004.safetensors",
-    "model.layers.10.self_attn.q_proj.base_layer.weight": "model-00001-of-00004.safetensors",
-    "model.layers.10.self_attn.q_proj.lora_A.default.weight": "model-00001-of-00004.safetensors",
-    "model.layers.10.self_attn.q_proj.lora_B.default.weight": "model-00001-of-00004.safetensors",
-    "model.layers.10.self_attn.v_proj.base_layer.weight": "model-00001-of-00004.safetensors",
-    "model.layers.10.self_attn.v_proj.lora_A.default.weight": "model-00001-of-00004.safetensors",
-    "model.layers.10.self_attn.v_proj.lora_B.default.weight": "model-00001-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.base_layer.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.lora_A.default.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.lora_B.default.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.o_proj.base_layer.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.o_proj.lora_A.default.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.o_proj.lora_B.default.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.base_layer.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.lora_A.default.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.lora_B.default.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.base_layer.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.lora_A.default.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.lora_B.default.weight": "model-00002-of-00004.safetensors",
     "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
     "model.layers.11.mlp.down_proj.base_layer.weight": "model-00002-of-00004.safetensors",
     "model.layers.11.mlp.down_proj.lora_A.default.weight": "model-00002-of-00004.safetensors",
@@ -304,23 +304,23 @@
     "model.layers.2.self_attn.v_proj.base_layer.weight": "model-00001-of-00004.safetensors",
     "model.layers.2.self_attn.v_proj.lora_A.default.weight": "model-00001-of-00004.safetensors",
     "model.layers.2.self_attn.v_proj.lora_B.default.weight": "model-00001-of-00004.safetensors",
-    "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors",
-    "model.layers.20.mlp.down_proj.base_layer.weight": "model-00002-of-00004.safetensors",
-    "model.layers.20.mlp.down_proj.lora_A.default.weight": "model-00002-of-00004.safetensors",
-    "model.layers.20.mlp.down_proj.lora_B.default.weight": "model-00002-of-00004.safetensors",
-    "model.layers.20.mlp.gate_proj.base_layer.weight": "model-00002-of-00004.safetensors",
-    "model.layers.20.mlp.gate_proj.lora_A.default.weight": "model-00002-of-00004.safetensors",
-    "model.layers.20.mlp.gate_proj.lora_B.default.weight": "model-00002-of-00004.safetensors",
-    "model.layers.20.mlp.up_proj.base_layer.weight": "model-00002-of-00004.safetensors",
-    "model.layers.20.mlp.up_proj.lora_A.default.weight": "model-00002-of-00004.safetensors",
-    "model.layers.20.mlp.up_proj.lora_B.default.weight": "model-00002-of-00004.safetensors",
-    "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.base_layer.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.lora_A.default.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.lora_B.default.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.gate_proj.base_layer.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.gate_proj.lora_A.default.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.gate_proj.lora_B.default.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.base_layer.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.lora_A.default.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.lora_B.default.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
     "model.layers.20.self_attn.k_proj.base_layer.weight": "model-00002-of-00004.safetensors",
     "model.layers.20.self_attn.k_proj.lora_A.default.weight": "model-00002-of-00004.safetensors",
     "model.layers.20.self_attn.k_proj.lora_B.default.weight": "model-00002-of-00004.safetensors",
-    "model.layers.20.self_attn.o_proj.base_layer.weight": "model-00002-of-00004.safetensors",
-    "model.layers.20.self_attn.o_proj.lora_A.default.weight": "model-00002-of-00004.safetensors",
-    "model.layers.20.self_attn.o_proj.lora_B.default.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.o_proj.base_layer.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.o_proj.lora_A.default.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.o_proj.lora_B.default.weight": "model-00003-of-00004.safetensors",
     "model.layers.20.self_attn.q_proj.base_layer.weight": "model-00002-of-00004.safetensors",
     "model.layers.20.self_attn.q_proj.lora_A.default.weight": "model-00002-of-00004.safetensors",
     "model.layers.20.self_attn.q_proj.lora_B.default.weight": "model-00002-of-00004.safetensors",
@@ -338,18 +338,18 @@
     "model.layers.21.mlp.up_proj.lora_A.default.weight": "model-00003-of-00004.safetensors",
     "model.layers.21.mlp.up_proj.lora_B.default.weight": "model-00003-of-00004.safetensors",
     "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
-    "model.layers.21.self_attn.k_proj.base_layer.weight": "model-00002-of-00004.safetensors",
-    "model.layers.21.self_attn.k_proj.lora_A.default.weight": "model-00002-of-00004.safetensors",
-    "model.layers.21.self_attn.k_proj.lora_B.default.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.base_layer.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.lora_A.default.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.lora_B.default.weight": "model-00003-of-00004.safetensors",
     "model.layers.21.self_attn.o_proj.base_layer.weight": "model-00003-of-00004.safetensors",
     "model.layers.21.self_attn.o_proj.lora_A.default.weight": "model-00003-of-00004.safetensors",
     "model.layers.21.self_attn.o_proj.lora_B.default.weight": "model-00003-of-00004.safetensors",
-    "model.layers.21.self_attn.q_proj.base_layer.weight": "model-00002-of-00004.safetensors",
-    "model.layers.21.self_attn.q_proj.lora_A.default.weight": "model-00002-of-00004.safetensors",
-    "model.layers.21.self_attn.q_proj.lora_B.default.weight": "model-00002-of-00004.safetensors",
-    "model.layers.21.self_attn.v_proj.base_layer.weight": "model-00002-of-00004.safetensors",
-    "model.layers.21.self_attn.v_proj.lora_A.default.weight": "model-00002-of-00004.safetensors",
-    "model.layers.21.self_attn.v_proj.lora_B.default.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.base_layer.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.lora_A.default.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.lora_B.default.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.base_layer.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.lora_A.default.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.lora_B.default.weight": "model-00003-of-00004.safetensors",
     "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
     "model.layers.22.mlp.down_proj.base_layer.weight": "model-00003-of-00004.safetensors",
     "model.layers.22.mlp.down_proj.lora_A.default.weight": "model-00003-of-00004.safetensors",
@@ -557,17 +557,17 @@
     "model.layers.3.self_attn.v_proj.base_layer.weight": "model-00001-of-00004.safetensors",
     "model.layers.3.self_attn.v_proj.lora_A.default.weight": "model-00001-of-00004.safetensors",
     "model.layers.3.self_attn.v_proj.lora_B.default.weight": "model-00001-of-00004.safetensors",
-    "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
-    "model.layers.30.mlp.down_proj.base_layer.weight": "model-00003-of-00004.safetensors",
-    "model.layers.30.mlp.down_proj.lora_A.default.weight": "model-00003-of-00004.safetensors",
-    "model.layers.30.mlp.down_proj.lora_B.default.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.30.mlp.down_proj.base_layer.weight": "model-00004-of-00004.safetensors",
+    "model.layers.30.mlp.down_proj.lora_A.default.weight": "model-00004-of-00004.safetensors",
+    "model.layers.30.mlp.down_proj.lora_B.default.weight": "model-00004-of-00004.safetensors",
     "model.layers.30.mlp.gate_proj.base_layer.weight": "model-00003-of-00004.safetensors",
     "model.layers.30.mlp.gate_proj.lora_A.default.weight": "model-00003-of-00004.safetensors",
     "model.layers.30.mlp.gate_proj.lora_B.default.weight": "model-00003-of-00004.safetensors",
-    "model.layers.30.mlp.up_proj.base_layer.weight": "model-00003-of-00004.safetensors",
-    "model.layers.30.mlp.up_proj.lora_A.default.weight": "model-00003-of-00004.safetensors",
-    "model.layers.30.mlp.up_proj.lora_B.default.weight": "model-00003-of-00004.safetensors",
-    "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.up_proj.base_layer.weight": "model-00004-of-00004.safetensors",
+    "model.layers.30.mlp.up_proj.lora_A.default.weight": "model-00004-of-00004.safetensors",
+    "model.layers.30.mlp.up_proj.lora_B.default.weight": "model-00004-of-00004.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
     "model.layers.30.self_attn.k_proj.base_layer.weight": "model-00003-of-00004.safetensors",
     "model.layers.30.self_attn.k_proj.lora_A.default.weight": "model-00003-of-00004.safetensors",
     "model.layers.30.self_attn.k_proj.lora_B.default.weight": "model-00003-of-00004.safetensors",
@@ -580,29 +580,29 @@
     "model.layers.30.self_attn.v_proj.base_layer.weight": "model-00003-of-00004.safetensors",
     "model.layers.30.self_attn.v_proj.lora_A.default.weight": "model-00003-of-00004.safetensors",
     "model.layers.30.self_attn.v_proj.lora_B.default.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.mlp.down_proj.base_layer.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.mlp.down_proj.lora_A.default.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.mlp.down_proj.lora_B.default.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.mlp.gate_proj.base_layer.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.mlp.gate_proj.lora_A.default.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.mlp.gate_proj.lora_B.default.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.mlp.up_proj.base_layer.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.mlp.up_proj.lora_A.default.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.mlp.up_proj.lora_B.default.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.self_attn.k_proj.base_layer.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.self_attn.k_proj.lora_A.default.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.self_attn.k_proj.lora_B.default.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.self_attn.o_proj.base_layer.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.self_attn.o_proj.lora_A.default.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.self_attn.o_proj.lora_B.default.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.self_attn.q_proj.base_layer.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.self_attn.q_proj.lora_A.default.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.self_attn.q_proj.lora_B.default.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.self_attn.v_proj.base_layer.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.self_attn.v_proj.lora_A.default.weight": "model-00003-of-00004.safetensors",
-    "model.layers.31.self_attn.v_proj.lora_B.default.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.down_proj.base_layer.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.down_proj.lora_A.default.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.down_proj.lora_B.default.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.gate_proj.base_layer.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.gate_proj.lora_A.default.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.gate_proj.lora_B.default.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.up_proj.base_layer.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.up_proj.lora_A.default.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.up_proj.lora_B.default.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.k_proj.base_layer.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.k_proj.lora_A.default.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.k_proj.lora_B.default.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.o_proj.base_layer.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.o_proj.lora_A.default.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.o_proj.lora_B.default.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.q_proj.base_layer.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.q_proj.lora_A.default.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.q_proj.lora_B.default.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.v_proj.base_layer.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.v_proj.lora_A.default.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.v_proj.lora_B.default.weight": "model-00004-of-00004.safetensors",
     "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
     "model.layers.4.mlp.down_proj.base_layer.weight": "model-00001-of-00004.safetensors",
     "model.layers.4.mlp.down_proj.lora_A.default.weight": "model-00001-of-00004.safetensors",
@@ -718,17 +718,17 @@
     "model.layers.8.self_attn.v_proj.base_layer.weight": "model-00001-of-00004.safetensors",
     "model.layers.8.self_attn.v_proj.lora_A.default.weight": "model-00001-of-00004.safetensors",
     "model.layers.8.self_attn.v_proj.lora_B.default.weight": "model-00001-of-00004.safetensors",
-    "model.layers.9.input_layernorm.weight": "model-00001-of-00004.safetensors",
-    "model.layers.9.mlp.down_proj.base_layer.weight": "model-00001-of-00004.safetensors",
-    "model.layers.9.mlp.down_proj.lora_A.default.weight": "model-00001-of-00004.safetensors",
-    "model.layers.9.mlp.down_proj.lora_B.default.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.base_layer.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.lora_A.default.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.lora_B.default.weight": "model-00002-of-00004.safetensors",
     "model.layers.9.mlp.gate_proj.base_layer.weight": "model-00001-of-00004.safetensors",
     "model.layers.9.mlp.gate_proj.lora_A.default.weight": "model-00001-of-00004.safetensors",
     "model.layers.9.mlp.gate_proj.lora_B.default.weight": "model-00001-of-00004.safetensors",
     "model.layers.9.mlp.up_proj.base_layer.weight": "model-00001-of-00004.safetensors",
     "model.layers.9.mlp.up_proj.lora_A.default.weight": "model-00001-of-00004.safetensors",
     "model.layers.9.mlp.up_proj.lora_B.default.weight": "model-00001-of-00004.safetensors",
-    "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
     "model.layers.9.self_attn.k_proj.base_layer.weight": "model-00001-of-00004.safetensors",
     "model.layers.9.self_attn.k_proj.lora_A.default.weight": "model-00001-of-00004.safetensors",
     "model.layers.9.self_attn.k_proj.lora_B.default.weight": "model-00001-of-00004.safetensors",
@@ -741,6 +741,6 @@
     "model.layers.9.self_attn.v_proj.base_layer.weight": "model-00001-of-00004.safetensors",
     "model.layers.9.self_attn.v_proj.lora_A.default.weight": "model-00001-of-00004.safetensors",
     "model.layers.9.self_attn.v_proj.lora_B.default.weight": "model-00001-of-00004.safetensors",
-    "model.norm.weight": "model-00003-of-00004.safetensors"
+    "model.norm.weight": "model-00004-of-00004.safetensors"
   }
 }
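A consistency note on the index above, derived from the diff rather than stated in it: total_size grows by about 0.67 GB, roughly the size of the original rank-128 adapter file, which is what doubling the LoRA rank predicts; the larger LoRA tensors are also why layers 9-10, 20-21, 30-31, and model.norm spill into later shards.

```python
# Derived check: the new total_size exceeds the old by exactly 640 MiB,
# close in size to the 671,150,064-byte rank-128 adapter it roughly mirrors.
old_total, new_total = 15_155_085_312, 15_826_173_952
assert new_total - old_total == 671_088_640  # bytes == 640 MiB
```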
runs/Jan13_23-59-25_b5637b3f66b9/events.out.tfevents.1705190375.b5637b3f66b9.92446.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8874f412db9c87f35d8ed512bcb8b617d88a7fa8ef0146836c21207630f29e23
+size 4830
runs/Jan14_00-01-54_b5637b3f66b9/events.out.tfevents.1705190524.b5637b3f66b9.97198.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fae5e1074485ffd1bf08978beaf2541d44a0ca0ab9aedf3adb717b974d6278eb
+size 5453
runs/Jan14_00-16-13_b5637b3f66b9/events.out.tfevents.1705191389.b5637b3f66b9.101961.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:870e01f6fd1473e804334e6163263ac048b9245808ef14d5523d9bbdeda2c9ae
+size 2051221
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c069c1a18c8aa647d08eab1ab70961843cfe9f48f19c1cd99bb1a4d5319fcc70
+oid sha256:f35dd2da756435a461f292de7f41f8f794f5bb87c3b8cf175f6493702e2c93f1
 size 4283