Commit fd9927d (verified) by jtatman · Parent(s): 8471b41

Upload folder using huggingface_hub
README.md CHANGED
@@ -5,18 +5,37 @@ tags:
 - lazymergekit
 - SciPhi/SciPhi-Mistral-7B-32k
 - SciPhi/SciPhi-Mistral-7B-32k
+- SciPhi/SciPhi-Mistral-7B-32k
+- SciPhi/SciPhi-Mistral-7B-32k
+- SciPhi/SciPhi-Mistral-7B-32k
+- SciPhi/SciPhi-Mistral-7B-32k
+- SciPhi/SciPhi-Mistral-7B-32k
+- SciPhi/SciPhi-Mistral-7B-32k
+- SciPhi/SciPhi-Mistral-7B-32k
 base_model:
 - SciPhi/SciPhi-Mistral-7B-32k
 - SciPhi/SciPhi-Mistral-7B-32k
+- SciPhi/SciPhi-Mistral-7B-32k
+- SciPhi/SciPhi-Mistral-7B-32k
+- SciPhi/SciPhi-Mistral-7B-32k
+- SciPhi/SciPhi-Mistral-7B-32k
+- SciPhi/SciPhi-Mistral-7B-32k
+- SciPhi/SciPhi-Mistral-7B-32k
+- SciPhi/SciPhi-Mistral-7B-32k
 ---
 
 # SciPhi-Mistral-7B-32k-sliced
 
-This is purely an experiment in sliced layer extraction to find active layers.
-
 SciPhi-Mistral-7B-32k-sliced is a merge of the following models using [LazyMergekit](https://colab.research.google.com/drive/1obulZ1ROXHjYLn6PPZJwRR6GzgQogxxb?usp=sharing):
 * [SciPhi/SciPhi-Mistral-7B-32k](https://huggingface.co/SciPhi/SciPhi-Mistral-7B-32k)
 * [SciPhi/SciPhi-Mistral-7B-32k](https://huggingface.co/SciPhi/SciPhi-Mistral-7B-32k)
+* [SciPhi/SciPhi-Mistral-7B-32k](https://huggingface.co/SciPhi/SciPhi-Mistral-7B-32k)
+* [SciPhi/SciPhi-Mistral-7B-32k](https://huggingface.co/SciPhi/SciPhi-Mistral-7B-32k)
+* [SciPhi/SciPhi-Mistral-7B-32k](https://huggingface.co/SciPhi/SciPhi-Mistral-7B-32k)
+* [SciPhi/SciPhi-Mistral-7B-32k](https://huggingface.co/SciPhi/SciPhi-Mistral-7B-32k)
+* [SciPhi/SciPhi-Mistral-7B-32k](https://huggingface.co/SciPhi/SciPhi-Mistral-7B-32k)
+* [SciPhi/SciPhi-Mistral-7B-32k](https://huggingface.co/SciPhi/SciPhi-Mistral-7B-32k)
+* [SciPhi/SciPhi-Mistral-7B-32k](https://huggingface.co/SciPhi/SciPhi-Mistral-7B-32k)
 
 ## 🧩 Configuration
 
@@ -24,20 +43,34 @@ SciPhi-Mistral-7B-32k-sliced is a merge of the following models using [LazyMerge
 slices:
   - sources:
     - model: SciPhi/SciPhi-Mistral-7B-32k
-      layer_range: [0, 6]
+      layer_range: [3, 3]
+  - sources:
+    - model: SciPhi/SciPhi-Mistral-7B-32k
+      layer_range: [5, 5]
+  - sources:
+    - model: SciPhi/SciPhi-Mistral-7B-32k
+      layer_range: [6, 6]
+  - sources:
+    - model: SciPhi/SciPhi-Mistral-7B-32k
+      layer_range: [10, 10]
+  - sources:
+    - model: SciPhi/SciPhi-Mistral-7B-32k
+      layer_range: [17, 17]
+  - sources:
+    - model: SciPhi/SciPhi-Mistral-7B-32k
+      layer_range: [18, 18]
+  - sources:
+    - model: SciPhi/SciPhi-Mistral-7B-32k
+      layer_range: [19, 19]
+  - sources:
+    - model: SciPhi/SciPhi-Mistral-7B-32k
+      layer_range: [20, 20]
+  - sources:
     - model: SciPhi/SciPhi-Mistral-7B-32k
-      layer_range: [26, 32]
+      layer_range: [23, 23]
 
-merge_method: slerp
-base_model: SciPhi/SciPhi-Mistral-7B-32k
 
-parameters:
-  t:
-    - filter: self_attn
-      value: [0, 0.5, 0.3, 0.7, 1]
-    - filter: mlp
-      value: [1, 0.5, 0.7, 0.3, 0]
-    - value: 0.5 # fallback for rest of tensors
+merge_method: passthrough
 tokenizer_source: union
 
 dtype: float16
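
Editor's note: as a quick sanity check of the README changes above, here is a minimal loading sketch with transformers. It is not part of the repo; the repo id `jtatman/SciPhi-Mistral-7B-32k-sliced` is assumed from the model name and committer, and since this commit reduces `num_hidden_layers` to 0 (see config.json below), the generation is only a smoke test, not a usable completion.

```python
# Hedged sketch, not from the repo: load the sliced checkpoint and run a dummy generation.
# Assumption: the repo id is "jtatman/SciPhi-Mistral-7B-32k-sliced" (inferred, not stated in this diff).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "jtatman/SciPhi-Mistral-7B-32k-sliced"  # assumed repo id
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.float16)

print(model.config.num_hidden_layers)  # 0 after this commit

# With zero decoder layers the forward pass is embed -> final norm -> lm_head,
# so this only verifies that the checkpoint loads and runs.
inputs = tokenizer("The theory of relativity states that", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=16, do_sample=False)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```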
config.json CHANGED
@@ -3,6 +3,7 @@
   "architectures": [
     "MistralForCausalLM"
   ],
+  "attention_dropout": 0.0,
   "bos_token_id": 1,
   "eos_token_id": 2,
   "hidden_act": "silu",
@@ -12,14 +13,14 @@
   "max_position_embeddings": 32768,
   "model_type": "mistral",
   "num_attention_heads": 32,
-  "num_hidden_layers": 6,
+  "num_hidden_layers": 0,
   "num_key_value_heads": 8,
   "rms_norm_eps": 1e-05,
   "rope_theta": 10000.0,
   "sliding_window": 4096,
   "tie_word_embeddings": false,
   "torch_dtype": "float16",
-  "transformers_version": "4.35.2",
+  "transformers_version": "4.37.2",
   "use_cache": false,
   "vocab_size": 32000
 }
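
Editor's note: the substantive config.json changes are `num_hidden_layers` dropping from 6 to 0, plus the `attention_dropout` field and `transformers_version` bump from the newer exporter. A small, hedged check (same assumed repo id as above) that reads those fields back:

```python
# Hedged sketch: read back the fields touched by this commit.
# AutoConfig only fetches config.json, so this is a cheap check.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("jtatman/SciPhi-Mistral-7B-32k-sliced")  # assumed repo id
print(cfg.num_hidden_layers)  # 0 (was 6 before this commit)
print(cfg.attention_dropout)  # 0.0, field added by the newer transformers version
print(cfg.sliding_window)     # 4096, unchanged
```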
mergekit_config.yml CHANGED
@@ -2,20 +2,34 @@
 slices:
   - sources:
     - model: SciPhi/SciPhi-Mistral-7B-32k
-      layer_range: [0, 6]
+      layer_range: [3, 3]
+  - sources:
+    - model: SciPhi/SciPhi-Mistral-7B-32k
+      layer_range: [5, 5]
+  - sources:
+    - model: SciPhi/SciPhi-Mistral-7B-32k
+      layer_range: [6, 6]
+  - sources:
+    - model: SciPhi/SciPhi-Mistral-7B-32k
+      layer_range: [10, 10]
+  - sources:
+    - model: SciPhi/SciPhi-Mistral-7B-32k
+      layer_range: [17, 17]
+  - sources:
+    - model: SciPhi/SciPhi-Mistral-7B-32k
+      layer_range: [18, 18]
+  - sources:
+    - model: SciPhi/SciPhi-Mistral-7B-32k
+      layer_range: [19, 19]
+  - sources:
+    - model: SciPhi/SciPhi-Mistral-7B-32k
+      layer_range: [20, 20]
+  - sources:
     - model: SciPhi/SciPhi-Mistral-7B-32k
-      layer_range: [26, 32]
+      layer_range: [23, 23]
 
-merge_method: slerp
-base_model: SciPhi/SciPhi-Mistral-7B-32k
 
-parameters:
-  t:
-    - filter: self_attn
-      value: [0, 0.5, 0.3, 0.7, 1]
-    - filter: mlp
-      value: [1, 0.5, 0.7, 0.3, 0]
-    - value: 0.5 # fallback for rest of tensors
+merge_method: passthrough
 tokenizer_source: union
 
 dtype: float16
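
Editor's note on the new slice definitions above: mergekit's `layer_range` appears to be end-exclusive, consistent with the old config where `[0, 6]` produced the previous `num_hidden_layers: 6`. Under that reading, every new range of the form `[n, n]` is empty, so the passthrough merge keeps zero decoder layers, matching `num_hidden_layers: 0` in config.json. A short sketch of that arithmetic, with a hypothetical single-layer variant for comparison:

```python
# Sketch: why the sliced model ends up with zero transformer layers.
# Assumption: layer_range is end-exclusive, [start, end), consistent with the old
# config where [0, 6] yielded "num_hidden_layers": 6.
new_ranges = [(3, 3), (5, 5), (6, 6), (10, 10), (17, 17),
              (18, 18), (19, 19), (20, 20), (23, 23)]

print(sum(end - start for start, end in new_ranges))  # 0 layers kept

# Hypothetical variant: keeping single layers 3, 5, 6, ..., 23 would need an
# exclusive end one past each layer index, e.g. [3, 4], [5, 6], [6, 7], ...
single_layer_ranges = [(n, n + 1) for n, _ in new_ranges]
print(sum(end - start for start, end in single_layer_ranges))  # 9 layers
```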
model-00001-of-00001.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37eb374525dcbbd712ce3958074be99256e72e846ecfc9b6d7b90d4e2af919bd
+size 524296512
model.safetensors.index.json CHANGED
@@ -1 +1 @@
-{"metadata": {"mergekit_version": "0.0.4.1"}, "weight_map": {"model.embed_tokens.weight": "model-00001-of-00002.safetensors", "lm_head.weight": "model-00001-of-00002.safetensors", "model.norm.weight": "model-00001-of-00002.safetensors", "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.2.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.2.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.1.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.1.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.1.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.1.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.1.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.0.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.0.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.0.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.0.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.0.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.0.input_layernorm.weight": "model-00002-of-00002.safetensors"}}
+{"metadata": {"mergekit_version": "0.0.4.1"}, "weight_map": {"model.embed_tokens.weight": "model-00001-of-00001.safetensors", "lm_head.weight": "model-00001-of-00001.safetensors", "model.norm.weight": "model-00001-of-00001.safetensors"}}
tokenizer_config.json CHANGED
@@ -1,4 +1,6 @@
 {
+  "add_bos_token": true,
+  "add_eos_token": false,
   "added_tokens_decoder": {
     "0": {
       "content": "<unk>",