chrlu committed (verified)
Commit c3b5e59 · Parent: c0563d9

Model save
README.md CHANGED
@@ -1,16 +1,11 @@
 ---
 license: other
-base_model: HuggingFaceH4/zephyr-7b-gemma-sft-v0.1
+base_model: Columbia-NLP/gemma-2b-zephyr-sft
 tags:
-- alignment-handbook
-- trl
-- dpo
-- generated_from_trainer
 - trl
 - dpo
+- alignment-handbook
 - generated_from_trainer
-datasets:
-- argilla/dpo-mix-7k
 model-index:
 - name: zephyr-7b-gemma-dpo
   results: []
@@ -21,17 +16,17 @@ should probably proofread and complete it, then remove this comment. -->
 
 # zephyr-7b-gemma-dpo
 
-This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-gemma-sft-v0.1](https://huggingface.co/HuggingFaceH4/zephyr-7b-gemma-sft-v0.1) on the argilla/dpo-mix-7k dataset.
+This model is a fine-tuned version of [Columbia-NLP/gemma-2b-zephyr-sft](https://huggingface.co/Columbia-NLP/gemma-2b-zephyr-sft) on an unknown dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.4683
-- Rewards/chosen: -3.0221
-- Rewards/rejected: -4.6813
-- Rewards/accuracies: 0.7708
-- Rewards/margins: 1.6592
-- Logps/rejected: -453.7782
-- Logps/chosen: -423.6228
-- Logits/rejected: 90.9822
-- Logits/chosen: 96.7158
+- Loss: 0.5972
+- Rewards/chosen: 0.3534
+- Rewards/rejected: 0.1096
+- Rewards/accuracies: 0.6771
+- Rewards/margins: 0.2437
+- Logps/rejected: -373.4094
+- Logps/chosen: -372.0067
+- Logits/rejected: -12.6654
+- Logits/chosen: -13.2926
 
 ## Model description
 
@@ -68,7 +63,7 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
 |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
-| 0.1591 | 1.8957 | 100 | 0.4692 | -2.9810 | -4.6451 | 0.7604 | 1.6641 | -453.0530 | -422.7995 | 91.0574 | 96.7661 |
+| 0.5732 | 1.8957 | 100 | 0.5972 | 0.3534 | 0.1096 | 0.6771 | 0.2437 | -373.4094 | -372.0067 | -12.6654 | -13.2926 |
 
 
 ### Framework versions
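
For orientation, a minimal sketch of loading the checkpoint this commit saves. The repo id `chrlu/zephyr-7b-gemma-dpo` is an assumption inferred from the committer and model name, not confirmed by the diff, and the chat-template usage assumes the tokenizer ships one:

```python
# Sketch: load the DPO-tuned checkpoint with transformers (repo id assumed).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "chrlu/zephyr-7b-gemma-dpo"  # hypothetical repo id

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,  # config.json stores bfloat16 weights
    device_map="auto",
)

messages = [{"role": "user", "content": "What is DPO in one sentence?"}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
out = model.generate(inputs, max_new_tokens=64)
print(tokenizer.decode(out[0][inputs.shape[-1]:], skip_special_tokens=True))
```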
all_results.json CHANGED
@@ -1,22 +1,22 @@
 {
     "epoch": 1.971563981042654,
-    "eval_logits/chosen": 96.71578216552734,
-    "eval_logits/rejected": 90.98221588134766,
-    "eval_logps/chosen": -423.6227722167969,
-    "eval_logps/rejected": -453.7782287597656,
-    "eval_loss": 0.468290776014328,
-    "eval_rewards/accuracies": 0.7708333134651184,
-    "eval_rewards/chosen": -3.0221338272094727,
-    "eval_rewards/margins": 1.6591955423355103,
-    "eval_rewards/rejected": -4.681329727172852,
-    "eval_runtime": 58.6185,
+    "eval_logits/chosen": 96.0078125,
+    "eval_logits/rejected": 90.1099853515625,
+    "eval_logps/chosen": -434.6161193847656,
+    "eval_logps/rejected": -466.3072814941406,
+    "eval_loss": 0.4578173756599426,
+    "eval_rewards/accuracies": 0.7604166865348816,
+    "eval_rewards/chosen": -3.5473945140838623,
+    "eval_rewards/margins": 1.6644223928451538,
+    "eval_rewards/rejected": -5.211816787719727,
+    "eval_runtime": 124.9704,
     "eval_samples": 750,
-    "eval_samples_per_second": 12.795,
-    "eval_steps_per_second": 0.409,
+    "eval_samples_per_second": 6.001,
+    "eval_steps_per_second": 0.192,
     "total_flos": 0.0,
-    "train_loss": 0.3921648321243433,
-    "train_runtime": 1190.3032,
+    "train_loss": 0.627926590350958,
+    "train_runtime": 756.4701,
     "train_samples": 6750,
-    "train_samples_per_second": 11.342,
-    "train_steps_per_second": 0.087
+    "train_samples_per_second": 17.846,
+    "train_steps_per_second": 0.137
 }
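
The throughput fields in the new file are mutually consistent; a quick check (assuming, as HF's Trainer appears to do, that train throughput is computed against the declared epoch count of 2 rather than the fractional 1.97 reached):

```python
# Sketch: sanity-check the throughput fields in the new all_results.json.
eval_samples, eval_runtime = 750, 124.9704
train_samples, train_runtime = 6750, 756.4701

print(eval_samples / eval_runtime)        # ~6.001 -> eval_samples_per_second
print(0.192 * eval_runtime)               # ~24 evaluation steps in total
print(train_samples * 2 / train_runtime)  # ~17.846 -> train_samples_per_second
print(0.137 * train_runtime)              # ~104 -> total optimizer steps
                                          #   (matches "step": 104 in trainer_state.json)
```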
config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "HuggingFaceH4/zephyr-7b-gemma-sft-v0.1",
+  "_name_or_path": "Columbia-NLP/gemma-2b-zephyr-sft",
   "architectures": [
     "GemmaForCausalLM"
   ],
@@ -10,20 +10,20 @@
   "head_dim": 256,
   "hidden_act": "gelu",
   "hidden_activation": null,
-  "hidden_size": 3072,
+  "hidden_size": 2048,
   "initializer_range": 0.02,
-  "intermediate_size": 24576,
+  "intermediate_size": 16384,
   "max_position_embeddings": 8192,
   "model_type": "gemma",
-  "num_attention_heads": 16,
-  "num_hidden_layers": 28,
-  "num_key_value_heads": 16,
+  "num_attention_heads": 8,
+  "num_hidden_layers": 18,
+  "num_key_value_heads": 1,
   "pad_token_id": 0,
   "rms_norm_eps": 1e-06,
   "rope_scaling": null,
   "rope_theta": 10000.0,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.40.1",
-  "use_cache": true,
+  "use_cache": false,
   "vocab_size": 256000
 }
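
The new config switches the base from 7B to Gemma-2b geometry (2048 hidden, 18 layers, multi-query attention with a single KV head). A short shape-arithmetic sketch shows this exactly reproduces the `total_size` in the new safetensors index below, assuming bfloat16 storage and Gemma's tied lm_head (no separate lm_head tensor appears in the weight_map):

```python
# Sketch: parameter count implied by the new config.json (Gemma-2b geometry).
hidden, inter, layers = 2048, 16384, 18
heads, kv_heads, head_dim = 8, 1, 256
vocab = 256000

embed = vocab * hidden                    # input embeddings (lm_head is tied)
attn = hidden * heads * head_dim \
     + 2 * hidden * kv_heads * head_dim \
     + heads * head_dim * hidden          # q_proj + k/v_proj (1 KV head) + o_proj
mlp = 3 * hidden * inter                  # gate_proj + up_proj + down_proj
norms = 2 * hidden                        # input + post-attention RMSNorm
params = embed + layers * (attn + mlp + norms) + hidden  # + final model.norm

print(params)      # 2506172416 parameters (~2.5B)
print(params * 2)  # 5012344832 bytes in bfloat16 == total_size in the new index
```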
eval_results.json CHANGED
@@ -1,16 +1,16 @@
 {
     "epoch": 1.971563981042654,
-    "eval_logits/chosen": 96.71578216552734,
-    "eval_logits/rejected": 90.98221588134766,
-    "eval_logps/chosen": -423.6227722167969,
-    "eval_logps/rejected": -453.7782287597656,
-    "eval_loss": 0.468290776014328,
-    "eval_rewards/accuracies": 0.7708333134651184,
-    "eval_rewards/chosen": -3.0221338272094727,
-    "eval_rewards/margins": 1.6591955423355103,
-    "eval_rewards/rejected": -4.681329727172852,
-    "eval_runtime": 58.6185,
+    "eval_logits/chosen": 96.0078125,
+    "eval_logits/rejected": 90.1099853515625,
+    "eval_logps/chosen": -434.6161193847656,
+    "eval_logps/rejected": -466.3072814941406,
+    "eval_loss": 0.4578173756599426,
+    "eval_rewards/accuracies": 0.7604166865348816,
+    "eval_rewards/chosen": -3.5473945140838623,
+    "eval_rewards/margins": 1.6644223928451538,
+    "eval_rewards/rejected": -5.211816787719727,
+    "eval_runtime": 124.9704,
     "eval_samples": 750,
-    "eval_samples_per_second": 12.795,
-    "eval_steps_per_second": 0.409
+    "eval_samples_per_second": 6.001,
+    "eval_steps_per_second": 0.192
 }
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:afe8c7b38eceaa5e7aeae3e932e2354f0dea3d47e873adf9bc1c35c4e2da99f2
+size 4945242264
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d618cb86564d9ea380614a6c12b636c502fb5ff187be1cb9426352eb6e895f51
+size 67121608
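
What actually gets committed for each shard is a Git LFS pointer, the three-line `version`/`oid`/`size` record shown above; the weights themselves live in LFS storage. A tiny parser sketch for that format:

```python
# Sketch: parse a Git LFS pointer file like the two committed above.
def parse_lfs_pointer(text: str) -> dict:
    """Split the 'key value' lines of a git-lfs spec v1 pointer into a dict."""
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    assert fields["version"].startswith("https://git-lfs.github.com/spec/v1")
    return {"oid": fields["oid"], "size": int(fields["size"])}

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:afe8c7b38eceaa5e7aeae3e932e2354f0dea3d47e873adf9bc1c35c4e2da99f2
size 4945242264"""

info = parse_lfs_pointer(pointer)
# Both shards together: 5012363872 bytes, slightly more than the index's
# total_size (5012344832) because each shard carries a safetensors header.
print(info["size"] + 67121608)
```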
model.safetensors.index.json CHANGED
@@ -1,261 +1,171 @@
 {
   "metadata": {
-    "total_size": 17075361792
+    "total_size": 5012344832
   },
   "weight_map": {
-    "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
-    "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
-    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
-    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
-    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
-    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
-    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
-    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
-    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
-    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
-    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
-    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
-    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
-    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
-    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
-    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.15.input_layernorm.weight": "model-00003-of-00004.safetensors",
-    "model.layers.15.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.15.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
-    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.16.input_layernorm.weight": "model-00003-of-00004.safetensors",
-    "model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
-    "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
-    "model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
-    "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
-    "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
-    "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
-    "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
-    "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
-    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
-    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
-    "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
-    "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
-    "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
-    "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
-    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
-    "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
-    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
-    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.24.input_layernorm.weight": "model-00004-of-00004.safetensors",
-    "model.layers.24.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.24.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.24.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.24.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
-    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
-    "model.layers.25.input_layernorm.weight": "model-00004-of-00004.safetensors",
-    "model.layers.25.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.25.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.25.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.25.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
-    "model.layers.25.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.25.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.25.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.25.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.26.input_layernorm.weight": "model-00004-of-00004.safetensors",
-    "model.layers.26.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.26.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.26.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.26.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
-    "model.layers.26.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.26.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.26.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.26.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.27.input_layernorm.weight": "model-00004-of-00004.safetensors",
-    "model.layers.27.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.27.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.27.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.27.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
-    "model.layers.27.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.27.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.27.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
-    "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
-    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
-    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
-    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
-    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
-    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
-    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
-    "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
-    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
-    "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
-    "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
-    "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
-    "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
-    "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
-    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
-    "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
-    "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
-    "model.norm.weight": "model-00004-of-00004.safetensors"
+    "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.norm.weight": "model-00002-of-00002.safetensors"
   }
 }
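
The index's `weight_map` routes each tensor name to the shard that stores it. A minimal sketch of resolving one tensor and reading it with the `safetensors` library (run from a local clone of the repo):

```python
# Sketch: resolve a tensor to its shard via weight_map and read just that tensor.
import json
from safetensors import safe_open

with open("model.safetensors.index.json") as f:
    index = json.load(f)

name = "model.layers.17.mlp.down_proj.weight"
shard = index["weight_map"][name]       # -> "model-00002-of-00002.safetensors"
with safe_open(shard, framework="pt") as f:
    tensor = f.get_tensor(name)         # loads only this tensor, not the whole shard
print(shard, tuple(tensor.shape))       # expect (2048, 16384) for down_proj
```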
runs/Apr27_14-56-02_660111d13776/events.out.tfevents.1714226380.660111d13776.25904.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:405d736d5884f25ea2c9a0384d1867c1f511cb06a4e1d8b83f1c865f03e3e449
+size 13438
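
The added file is a TensorBoard event log (also stored via LFS). A sketch of reading its scalars with the `tensorboard` package; the exact tag names are an assumption (the Trainer typically logs under prefixes like `train/`):

```python
# Sketch: inspect the committed TensorBoard event file (tag names assumed).
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

path = "runs/Apr27_14-56-02_660111d13776"
acc = EventAccumulator(path)
acc.Reload()

print(acc.Tags()["scalars"])             # e.g. ["train/loss", "train/rewards/margins", ...]
for event in acc.Scalars("train/loss"):  # tag name is an assumption
    print(event.step, event.value)
```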
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
     "epoch": 1.971563981042654,
     "total_flos": 0.0,
-    "train_loss": 0.3921648321243433,
-    "train_runtime": 1190.3032,
+    "train_loss": 0.627926590350958,
+    "train_runtime": 756.4701,
     "train_samples": 6750,
-    "train_samples_per_second": 11.342,
-    "train_steps_per_second": 0.087
+    "train_samples_per_second": 17.846,
+    "train_steps_per_second": 0.137
 }
trainer_state.json CHANGED
@@ -10,12 +10,12 @@
   "log_history": [
     {
       "epoch": 0.018957345971563982,
-      "grad_norm": 133.64647421295854,
+      "grad_norm": 15.786988646394411,
       "learning_rate": 4.545454545454545e-08,
-      "logits/chosen": 117.4909439086914,
-      "logits/rejected": 126.8502426147461,
-      "logps/chosen": -338.3250732421875,
-      "logps/rejected": -438.210205078125,
+      "logits/chosen": -13.905267715454102,
+      "logits/rejected": -14.118387222290039,
+      "logps/chosen": -350.8895263671875,
+      "logps/rejected": -446.6286926269531,
       "loss": 0.6931,
       "rewards/accuracies": 0.0,
       "rewards/chosen": 0.0,
@@ -25,178 +25,178 @@
     },
     {
       "epoch": 0.1895734597156398,
-      "grad_norm": 137.17714765050428,
+      "grad_norm": 15.908099576913655,
       "learning_rate": 4.545454545454545e-07,
-      "logits/chosen": 135.0139923095703,
-      "logits/rejected": 138.361328125,
-      "logps/chosen": -397.126220703125,
-      "logps/rejected": -439.42083740234375,
-      "loss": 0.7143,
-      "rewards/accuracies": 0.3888888955116272,
-      "rewards/chosen": -0.02168009988963604,
-      "rewards/margins": -0.04445798695087433,
-      "rewards/rejected": 0.02277788519859314,
+      "logits/chosen": -14.040081024169922,
+      "logits/rejected": -14.157392501831055,
+      "logps/chosen": -416.2701416015625,
+      "logps/rejected": -449.4697265625,
+      "loss": 0.693,
+      "rewards/accuracies": 0.5138888955116272,
+      "rewards/chosen": 0.0006088384543545544,
+      "rewards/margins": 0.008209776133298874,
+      "rewards/rejected": -0.007600938435643911,
       "step": 10
     },
     {
       "epoch": 0.3791469194312796,
-      "grad_norm": 125.18497680199994,
+      "grad_norm": 14.229474825008781,
       "learning_rate": 4.885348141000122e-07,
-      "logits/chosen": 121.53276062011719,
-      "logits/rejected": 125.26307678222656,
-      "logps/chosen": -370.91107177734375,
-      "logps/rejected": -423.38677978515625,
-      "loss": 0.6418,
-      "rewards/accuracies": 0.5874999761581421,
-      "rewards/chosen": 0.08632902055978775,
-      "rewards/margins": 0.28587669134140015,
-      "rewards/rejected": -0.199547678232193,
+      "logits/chosen": -13.39338207244873,
+      "logits/rejected": -13.542058944702148,
+      "logps/chosen": -392.9753723144531,
+      "logps/rejected": -427.68096923828125,
+      "loss": 0.6892,
+      "rewards/accuracies": 0.5062500238418579,
+      "rewards/chosen": 0.010071685537695885,
+      "rewards/margins": 0.003802267834544182,
+      "rewards/rejected": 0.006269416771829128,
       "step": 20
     },
     {
       "epoch": 0.5687203791469194,
-      "grad_norm": 115.98151525592598,
+      "grad_norm": 15.853985724357454,
      "learning_rate": 4.5025027361734613e-07,
-      "logits/chosen": 142.67178344726562,
-      "logits/rejected": 136.16537475585938,
-      "logps/chosen": -415.01104736328125,
-      "logps/rejected": -460.3519592285156,
-      "loss": 0.5673,
-      "rewards/accuracies": 0.668749988079071,
-      "rewards/chosen": -1.1033741235733032,
-      "rewards/margins": 0.8893669843673706,
-      "rewards/rejected": -1.9927412271499634,
+      "logits/chosen": -14.269427299499512,
+      "logits/rejected": -13.808093070983887,
+      "logps/chosen": -412.9443359375,
+      "logps/rejected": -428.38494873046875,
+      "loss": 0.674,
+      "rewards/accuracies": 0.612500011920929,
+      "rewards/chosen": 0.04771440848708153,
+      "rewards/margins": 0.035354893654584885,
+      "rewards/rejected": 0.012359511107206345,
       "step": 30
     },
     {
       "epoch": 0.7582938388625592,
-      "grad_norm": 112.04677171325864,
+      "grad_norm": 14.687978809678542,
       "learning_rate": 3.893311157806091e-07,
-      "logits/chosen": 124.56459045410156,
-      "logits/rejected": 113.08979797363281,
-      "logps/chosen": -391.10174560546875,
-      "logps/rejected": -417.09051513671875,
-      "loss": 0.5586,
-      "rewards/accuracies": 0.71875,
-      "rewards/chosen": -1.8466203212738037,
-      "rewards/margins": 1.0624934434890747,
-      "rewards/rejected": -2.909113883972168,
+      "logits/chosen": -13.886492729187012,
+      "logits/rejected": -13.28197956085205,
+      "logps/chosen": -374.98211669921875,
+      "logps/rejected": -366.5968322753906,
+      "loss": 0.657,
+      "rewards/accuracies": 0.65625,
+      "rewards/chosen": 0.13442906737327576,
+      "rewards/margins": 0.07902240008115768,
+      "rewards/rejected": 0.05540664866566658,
       "step": 40
     },
     {
       "epoch": 0.9478672985781991,
-      "grad_norm": 126.0267199667638,
+      "grad_norm": 15.872142673244408,
       "learning_rate": 3.126631330646801e-07,
-      "logits/chosen": 138.824462890625,
-      "logits/rejected": 142.9259033203125,
-      "logps/chosen": -455.6646423339844,
-      "logps/rejected": -536.987548828125,
-      "loss": 0.4941,
-      "rewards/accuracies": 0.731249988079071,
-      "rewards/chosen": -1.8371152877807617,
-      "rewards/margins": 1.2058273553848267,
-      "rewards/rejected": -3.042942523956299,
+      "logits/chosen": -14.917936325073242,
+      "logits/rejected": -14.90648078918457,
+      "logps/chosen": -429.6836853027344,
+      "logps/rejected": -480.3504943847656,
+      "loss": 0.6344,
+      "rewards/accuracies": 0.6875,
+      "rewards/chosen": 0.24091288447380066,
+      "rewards/margins": 0.1330389827489853,
+      "rewards/rejected": 0.10787389427423477,
       "step": 50
     },
     {
       "epoch": 1.1374407582938388,
-      "grad_norm": 63.02753605606795,
+      "grad_norm": 14.061428605486398,
       "learning_rate": 2.2891223348923882e-07,
-      "logits/chosen": 131.37802124023438,
-      "logits/rejected": 134.72222900390625,
-      "logps/chosen": -436.17047119140625,
-      "logps/rejected": -520.2355346679688,
-      "loss": 0.3078,
-      "rewards/accuracies": 0.918749988079071,
-      "rewards/chosen": -1.9126123189926147,
-      "rewards/margins": 2.285891056060791,
-      "rewards/rejected": -4.198503017425537,
+      "logits/chosen": -14.622962951660156,
+      "logits/rejected": -14.403157234191895,
+      "logps/chosen": -415.7464904785156,
+      "logps/rejected": -441.731201171875,
+      "loss": 0.6063,
+      "rewards/accuracies": 0.7437499761581421,
+      "rewards/chosen": 0.3395090103149414,
+      "rewards/margins": 0.22218124568462372,
+      "rewards/rejected": 0.11732780933380127,
       "step": 60
     },
     {
       "epoch": 1.3270142180094786,
-      "grad_norm": 59.15589622996558,
+      "grad_norm": 12.963152293888875,
       "learning_rate": 1.4754491880085317e-07,
-      "logits/chosen": 124.51689147949219,
-      "logits/rejected": 126.70524597167969,
-      "logps/chosen": -402.62066650390625,
-      "logps/rejected": -505.50006103515625,
-      "loss": 0.1932,
-      "rewards/accuracies": 0.9312499761581421,
-      "rewards/chosen": -1.7549495697021484,
-      "rewards/margins": 2.8932533264160156,
-      "rewards/rejected": -4.648203372955322,
+      "logits/chosen": -14.022384643554688,
+      "logits/rejected": -13.828951835632324,
+      "logps/chosen": -382.23468017578125,
+      "logps/rejected": -418.2818908691406,
+      "loss": 0.6011,
+      "rewards/accuracies": 0.706250011920929,
+      "rewards/chosen": 0.3396778702735901,
+      "rewards/margins": 0.22157195210456848,
+      "rewards/rejected": 0.118105947971344,
       "step": 70
     },
     {
       "epoch": 1.5165876777251186,
-      "grad_norm": 44.24206971141979,
+      "grad_norm": 12.394681314131397,
       "learning_rate": 7.775827023107834e-08,
-      "logits/chosen": 111.74947357177734,
-      "logits/rejected": 128.5332489013672,
-      "logps/chosen": -400.06146240234375,
-      "logps/rejected": -519.4473876953125,
-      "loss": 0.1687,
-      "rewards/accuracies": 0.9624999761581421,
-      "rewards/chosen": -2.326984167098999,
-      "rewards/margins": 2.937407970428467,
-      "rewards/rejected": -5.264392375946045,
+      "logits/chosen": -13.705121040344238,
+      "logits/rejected": -14.205709457397461,
+      "logps/chosen": -367.263427734375,
+      "logps/rejected": -423.30841064453125,
+      "loss": 0.5788,
+      "rewards/accuracies": 0.706250011920929,
+      "rewards/chosen": 0.36119210720062256,
+      "rewards/margins": 0.3365553319454193,
+      "rewards/rejected": 0.024636749178171158,
       "step": 80
     },
     {
       "epoch": 1.7061611374407581,
-      "grad_norm": 50.07584592888485,
+      "grad_norm": 14.456589635016153,
       "learning_rate": 2.7440387297912122e-08,
-      "logits/chosen": 110.84814453125,
-      "logits/rejected": 123.78230285644531,
-      "logps/chosen": -435.03265380859375,
-      "logps/rejected": -550.7723388671875,
-      "loss": 0.1579,
-      "rewards/accuracies": 0.9750000238418579,
-      "rewards/chosen": -2.400338649749756,
-      "rewards/margins": 3.238767623901367,
-      "rewards/rejected": -5.639105796813965,
+      "logits/chosen": -13.98394775390625,
+      "logits/rejected": -14.161648750305176,
+      "logps/chosen": -399.45458984375,
+      "logps/rejected": -447.48828125,
+      "loss": 0.5766,
+      "rewards/accuracies": 0.78125,
+      "rewards/chosen": 0.3995341658592224,
+      "rewards/margins": 0.34082064032554626,
+      "rewards/rejected": 0.05871356278657913,
       "step": 90
     },
     {
       "epoch": 1.8957345971563981,
-      "grad_norm": 49.65552371508206,
+      "grad_norm": 13.44211674398592,
       "learning_rate": 2.27878296044029e-09,
-      "logits/chosen": 117.1094970703125,
-      "logits/rejected": 117.060302734375,
-      "logps/chosen": -427.23431396484375,
-      "logps/rejected": -520.2066650390625,
-      "loss": 0.1591,
-      "rewards/accuracies": 0.949999988079071,
-      "rewards/chosen": -2.2608113288879395,
-      "rewards/margins": 2.9113571643829346,
-      "rewards/rejected": -5.172169208526611,
+      "logits/chosen": -14.160197257995605,
+      "logits/rejected": -14.141824722290039,
+      "logps/chosen": -392.3072509765625,
+      "logps/rejected": -421.604248046875,
+      "loss": 0.5732,
+      "rewards/accuracies": 0.7250000238418579,
+      "rewards/chosen": 0.4029002785682678,
+      "rewards/margins": 0.27652695775032043,
+      "rewards/rejected": 0.1263733208179474,
       "step": 100
     },
     {
       "epoch": 1.8957345971563981,
-      "eval_logits/chosen": 96.76607513427734,
-      "eval_logits/rejected": 91.05736541748047,
-      "eval_logps/chosen": -422.7994689941406,
-      "eval_logps/rejected": -453.052978515625,
-      "eval_loss": 0.4691648781299591,
-      "eval_rewards/accuracies": 0.7604166865348816,
-      "eval_rewards/chosen": -2.9809672832489014,
-      "eval_rewards/margins": 1.6640973091125488,
-      "eval_rewards/rejected": -4.645064353942871,
-      "eval_runtime": 56.9732,
-      "eval_samples_per_second": 13.164,
-      "eval_steps_per_second": 0.421,
+      "eval_logits/chosen": -13.292621612548828,
+      "eval_logits/rejected": -12.66539478302002,
+      "eval_logps/chosen": -372.0066833496094,
+      "eval_logps/rejected": -373.4093933105469,
+      "eval_loss": 0.5971602201461792,
+      "eval_rewards/accuracies": 0.6770833134651184,
+      "eval_rewards/chosen": 0.3533553183078766,
+      "eval_rewards/margins": 0.24372106790542603,
+      "eval_rewards/rejected": 0.10963428020477295,
+      "eval_runtime": 20.0916,
+      "eval_samples_per_second": 37.329,
+      "eval_steps_per_second": 1.195,
       "step": 100
     },
     {
       "epoch": 1.971563981042654,
       "step": 104,
       "total_flos": 0.0,
-      "train_loss": 0.3921648321243433,
-      "train_runtime": 1190.3032,
-      "train_samples_per_second": 11.342,
-      "train_steps_per_second": 0.087
+      "train_loss": 0.627926590350958,
+      "train_runtime": 756.4701,
+      "train_samples_per_second": 17.846,
+      "train_steps_per_second": 0.137
     }
   ],
   "logging_steps": 10,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:912545168b662aa28a59d2ea7ff1a88af69cd74e0b822da195076b8d4f0f07e5
+oid sha256:c83a4cd67c56c86e8779774ef2b3c0c2d20d775dd7b0aa4eba03778d916c3903
 size 6264