thomasgauthier committed
Commit 5e57ff4
1 Parent(s): fd211d9

Added expert extraction code

Files changed (1): README.md (+106, -1)
README.md CHANGED
@@ -35,4 +35,109 @@ The following named weight correspondence was used:
 | [**Unmixtraled-22B-v0.1-expert-5**](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-expert-5) | **Mixtral 8x22B embed, attn, layernorm, lm_head + expert 5 MLPs** | **1099.32373046875** |
 | [Unmixtraled-22B-v0.1-expert-6](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-expert-6) | Mixtral 8x22B embed, attn, layernorm, lm_head + expert 6 MLPs | 341.5309753417969 |
 | [Unmixtraled-22B-v0.1-expert-7](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-expert-7) | Mixtral 8x22B embed, attn, layernorm, lm_head + expert 7 MLPs | 2099.63818359375 |
- | [Unmixtraled-22B-v0.1-lerp](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-lerp) | Mixtral 8x22B embed, attn, layernorm, lm_head + linear merge of expert 0-7 MLPs | 1873.9874267578125 |
+ | [Unmixtraled-22B-v0.1-lerp](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-lerp) | Mixtral 8x22B embed, attn, layernorm, lm_head + linear merge of expert 0-7 MLPs | 1873.9874267578125 |
+
+ # Code
+
+ The following code was used to extract the experts and construct the dense models:
+
+ ```python
+ # pip install -U transformers huggingface_hub "git+https://github.com/arcee-ai/mergekit@7467108c05d56ef2bb4b8f33936d437dc448f7dd"
+
+ import fnmatch
+ import json
+ import os
+ import re
+ import shutil
+
+ import torch
+ from huggingface_hub import snapshot_download
+ from mergekit.architecture import get_architecture_info
+ from mergekit.common import ModelReference
+ from mergekit.io import LazyTensorLoader, TensorWriter
+ from tqdm import tqdm
+
+ MIXTRAL_MODEL_ID = "mistral-community/Mixtral-8x22B-v0.1"
+ MIXTRAL_PATH = snapshot_download(repo_id=MIXTRAL_MODEL_ID)
+ print(f"Mixtral downloaded to: {MIXTRAL_PATH}")
+
+ MISTRAL_PATH = snapshot_download(
+     repo_id="mistralai/Mistral-7B-v0.1", allow_patterns=["config.json"]
+ )
+ print(f"Mistral config downloaded to: {MISTRAL_PATH}")
+
+ with open(os.path.join(MISTRAL_PATH, "config.json"), "r") as f:
+     mistral_config = json.load(f)
+
+ with open(os.path.join(MIXTRAL_PATH, "config.json"), "r") as f:
+     mixtral_config = json.load(f)
+
+ # Build a dense Mistral config from the Mixtral config, keeping only the keys Mistral uses
+ combined_config = {
+     key: mixtral_config[key] for key in mistral_config if key in mixtral_config
+ }
+ combined_config["architectures"] = ["MistralForCausalLM"]
+ combined_config["model_type"] = "mistral"
+
+ mixtral_model_ref = ModelReference.parse(MIXTRAL_PATH)
+ mixtral_architecture_info = get_architecture_info(mixtral_model_ref.config())
+ mixtral_loader = LazyTensorLoader(mixtral_model_ref.tensor_index(), lazy_unpickle=True)
+
+ ALLOW_LIST = ["generation_config.json", "tokenizer.model", "tokenizer_config.json"]
+
+ def copy_directory(src, dest, allowed_patterns):
+     os.makedirs(dest, exist_ok=True)
+     for root, dirs, files in os.walk(src):
+         # Only keep directories that match at least one of the allowed patterns
+         dirs[:] = [d for d in dirs if any(fnmatch.fnmatch(d, pattern) for pattern in allowed_patterns)]
+         for file in files:
+             # Only copy files that match at least one of the allowed patterns
+             if any(fnmatch.fnmatch(file, pattern) for pattern in allowed_patterns):
+                 src_path = os.path.join(root, file)
+                 dest_path = os.path.join(dest, os.path.relpath(src_path, src))
+                 os.makedirs(os.path.dirname(dest_path), exist_ok=True)
+                 shutil.copy2(src_path, dest_path)
+
+ def get_tensor(layer_num, expert_num, tensor_type):
+     weight_name = f"model.layers.{layer_num}.block_sparse_moe.experts.{expert_num}.{tensor_type}.weight"
+     return mixtral_loader.get_tensor(weight_name)
+
+
+ def extract_layer_number(string):
+     match = re.search(r"layers\.(\d+)\.", string)
+     return int(match.group(1)) if match else None
+
+
+ def save_expert_as_dense(output_path, expert_num):
+     dense_model_ref = ModelReference.parse(output_path)
+     dense_architecture_info = get_architecture_info(dense_model_ref.config())
+
+     writer = TensorWriter(output_path, safe_serialization=True)
+
+     # Map the dense MLP projections to the chosen expert's weights (w1 = gate_proj, w2 = down_proj, w3 = up_proj);
+     # all other tensors (embeddings, attention, layernorms, lm_head) are copied from Mixtral unchanged
+     for weight_info in tqdm(dense_architecture_info.all_weights(dense_model_ref.config())):
+         if weight_info.name.endswith(".up_proj.weight"):
+             layer_num = extract_layer_number(weight_info.name)
+             writer.save_tensor(weight_info.name, get_tensor(layer_num, expert_num, "w3"))
+         elif weight_info.name.endswith(".down_proj.weight"):
+             layer_num = extract_layer_number(weight_info.name)
+             writer.save_tensor(weight_info.name, get_tensor(layer_num, expert_num, "w2"))
+         elif weight_info.name.endswith(".gate_proj.weight"):
+             layer_num = extract_layer_number(weight_info.name)
+             writer.save_tensor(weight_info.name, get_tensor(layer_num, expert_num, "w1"))
+         else:
+             writer.save_tensor(weight_info.name, mixtral_loader.get_tensor(weight_info.name))
+
+     writer.finalize()
+
+
+ num_experts = mixtral_config["num_local_experts"]
+
+ for expert_num in range(num_experts):
+     dense_path = f"./dense_expert_{expert_num}"
+     copy_directory(MIXTRAL_PATH, dense_path, ALLOW_LIST)
+
+     with open(os.path.join(dense_path, "config.json"), "w") as f:
+         json.dump(combined_config, f, indent=2)
+
+     save_expert_as_dense(dense_path, expert_num)
+     print(f"Dense model #{expert_num} saved to {os.path.abspath(dense_path)}")
+ ```
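
The table also lists Unmixtraled-22B-v0.1-lerp, described as a linear merge of the expert 0-7 MLPs, which the script above does not produce. As a rough sketch only (not the code actually used for that model), an equal-weight average of the expert MLP tensors could be written with the same helpers. This assumes `get_tensor`, `extract_layer_number`, `mixtral_loader`, and `num_experts` from the script are in scope, and that the merge is a plain unweighted mean; `save_lerp_as_dense` is a hypothetical name:

```python
# Hypothetical sketch, not the actual recipe behind Unmixtraled-22B-v0.1-lerp.
def save_lerp_as_dense(output_path):
    dense_model_ref = ModelReference.parse(output_path)
    dense_architecture_info = get_architecture_info(dense_model_ref.config())
    writer = TensorWriter(output_path, safe_serialization=True)

    def averaged(layer_num, tensor_type):
        # Equal-weight linear interpolation of the eight expert MLP tensors
        tensors = [get_tensor(layer_num, e, tensor_type) for e in range(num_experts)]
        return torch.stack([t.float() for t in tensors]).mean(dim=0).to(tensors[0].dtype)

    for weight_info in tqdm(dense_architecture_info.all_weights(dense_model_ref.config())):
        layer_num = extract_layer_number(weight_info.name)
        if weight_info.name.endswith(".up_proj.weight"):
            writer.save_tensor(weight_info.name, averaged(layer_num, "w3"))
        elif weight_info.name.endswith(".down_proj.weight"):
            writer.save_tensor(weight_info.name, averaged(layer_num, "w2"))
        elif weight_info.name.endswith(".gate_proj.weight"):
            writer.save_tensor(weight_info.name, averaged(layer_num, "w1"))
        else:
            writer.save_tensor(weight_info.name, mixtral_loader.get_tensor(weight_info.name))

    writer.finalize()
```

Since each output folder carries a `MistralForCausalLM` config, the resulting models should load as ordinary dense checkpoints via `transformers.AutoModelForCausalLM.from_pretrained`.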