Artples committed on
Commit cfa119d · verified · 1 parent: f335543

Upload 2 files

Files changed (2)
  1. Kopie_von_⚡_AutoQuant.ipynb +355 -0
  2. LazyMergekit.ipynb +0 -0
Kopie_von_⚡_AutoQuant.ipynb ADDED
@@ -0,0 +1,355 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "id": "fD24jJxq7t3k"
+ },
+ "outputs": [],
+ "source": [
+ "# @title # ⚡ AutoQuant\n",
+ "\n",
+ "# @markdown > 🗣️ [Large Language Model Course](https://github.com/mlabonne/llm-course)\n",
+ "\n",
+ "# @markdown ❤️ Created by [@maximelabonne](https://twitter.com/maximelabonne).\n",
+ "\n",
+ "# @markdown **Usage:** Download the model by **running this cell** and then run the cells corresponding to your quantization methods of interest.\n",
+ "\n",
+ "# @markdown To quantize a 7B model, GGUF only needs a T4 GPU, while the other methods require an A100 GPU.\n",
+ "\n",
+ "# @markdown *See also the [AutoQuantize](https://colab.research.google.com/drive/1Li3USnl3yoYctqJLtYux3LAIy4Bnnv3J) notebook from zainulabideen.*\n",
+ "\n",
+ "# @markdown ---\n",
+ "\n",
+ "# @markdown ## 🤗 Download model (required)\n",
+ "# @markdown `HF_TOKEN` corresponds to the name of the secret that stores your [Hugging Face access token](https://huggingface.co/settings/tokens) in Colab.\n",
+ "\n",
+ "MODEL_ID = \"mlabonne/Zebrafish-7B\" # @param {type:\"string\"}\n",
+ "USERNAME = \"Artples\" # @param {type:\"string\"}\n",
+ "HF_TOKEN = \"HF_TOKEN\" # @param {type:\"string\"}\n",
+ "\n",
+ "MODEL_NAME = MODEL_ID.split('/')[-1]\n",
+ "\n",
+ "# Download model\n",
+ "!git lfs install\n",
+ "!git clone https://huggingface.co/{MODEL_ID}\n",
+ "!pip install -q huggingface_hub\n",
+ "\n",
+ "from huggingface_hub import create_repo, HfApi, ModelCard\n",
+ "from google.colab import userdata, runtime\n",
+ "\n",
+ "# Defined in the secrets tab in Google Colab\n",
+ "hf_token = userdata.get(HF_TOKEN)\n",
+ "api = HfApi()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "NL0yGhbe3EFk"
+ },
+ "outputs": [],
+ "source": [
+ "# @title ## 🧩 GGUF\n",
+ "\n",
+ "# @markdown Quantization methods: `q2_k`, `q3_k_l`, `q3_k_m`, `q3_k_s`, `q4_0`, `q4_1`, `q4_k_m`, `q4_k_s`, `q5_0`, `q5_1`, `q5_k_m`, `q5_k_s`, `q6_k`, `q8_0`\n",
+ "\n",
+ "# @markdown Learn more about GGUF and quantization methods in [this article](https://mlabonne.github.io/blog/posts/Quantize_Llama_2_models_using_ggml.html).\n",
+ "\n",
+ "QUANTIZATION_FORMAT = \"q5_k_m\" # @param {type:\"string\"}\n",
+ "QUANTIZATION_METHODS = QUANTIZATION_FORMAT.replace(\" \", \"\").split(\",\")\n",
+ "\n",
+ "# Install llama.cpp\n",
+ "!git clone https://github.com/ggerganov/llama.cpp\n",
+ "!cd llama.cpp && git pull && make clean && LLAMA_CUBLAS=1 make\n",
+ "!pip install -r llama.cpp/requirements.txt\n",
+ "\n",
+ "# Convert to fp16\n",
+ "fp16 = f\"{MODEL_NAME}/{MODEL_NAME.lower()}.fp16.bin\"\n",
+ "!python llama.cpp/convert.py {MODEL_NAME} --outtype f16 --outfile {fp16}\n",
+ "\n",
+ "# Quantize the model for each method in the QUANTIZATION_METHODS list\n",
+ "for method in QUANTIZATION_METHODS:\n",
+ " qtype = f\"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf\"\n",
+ " !./llama.cpp/quantize {fp16} {qtype} {method}\n",
+ "\n",
+ "# Create model card\n",
+ "card = ModelCard.load(MODEL_ID)\n",
+ "card.data.tags.append(\"autoquant\")\n",
+ "card.data.tags.append(\"gguf\")\n",
+ "card.save(f'{MODEL_NAME}/README.md')\n",
+ "\n",
+ "# Upload model\n",
+ "create_repo(\n",
+ " repo_id = f\"{USERNAME}/{MODEL_NAME}-GGUF\",\n",
+ " repo_type=\"model\",\n",
+ " exist_ok=True,\n",
+ " token=hf_token\n",
+ ")\n",
+ "api.upload_folder(\n",
+ " folder_path=MODEL_NAME,\n",
+ " repo_id=f\"{USERNAME}/{MODEL_NAME}-GGUF\",\n",
+ " allow_patterns=[\"*.gguf\",\"*.md\"],\n",
+ " token=hf_token\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "id": "OE_R3AXG5Y-F"
+ },
+ "outputs": [],
+ "source": [
+ "# @title ## 🧠 GPTQ\n",
+ "\n",
+ "# @markdown Learn more about the GPTQ algorithm in [this article](https://mlabonne.github.io/blog/posts/4_bit_Quantization_with_GPTQ.html).\n",
+ "\n",
+ "!pip install auto-gptq optimum accelerate\n",
+ "\n",
+ "from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig\n",
+ "\n",
+ "BITS = 4 # @param {type:\"integer\"}\n",
+ "GROUP_SIZE = 128 # @param {type:\"integer\"}\n",
+ "DAMP_PERCENT = 0.1 # @param {type:\"number\"}\n",
+ "\n",
+ "# Quantize model\n",
+ "tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)\n",
+ "quantization_config = GPTQConfig(bits=BITS, dataset=\"c4\", tokenizer=tokenizer, group_size=GROUP_SIZE, damp_percent=DAMP_PERCENT)\n",
+ "model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map=\"auto\", quantization_config=quantization_config, low_cpu_mem_usage=True)\n",
+ "\n",
+ "# Save model and tokenizer\n",
+ "save_folder = MODEL_NAME + \"-GPTQ\"\n",
+ "model.save_pretrained(save_folder, use_safetensors=True)\n",
+ "tokenizer.save_pretrained(save_folder)\n",
+ "\n",
+ "# Create model card\n",
+ "card = ModelCard.load(MODEL_ID)\n",
+ "card.data.tags.append(\"autoquant\")\n",
+ "card.data.tags.append(\"gptq\")\n",
+ "card.save(f'{save_folder}/README.md')\n",
+ "\n",
+ "# Upload model\n",
+ "create_repo(\n",
+ " repo_id = f\"{USERNAME}/{MODEL_NAME}-GPTQ\",\n",
+ " repo_type=\"model\",\n",
+ " exist_ok=True,\n",
+ " token=hf_token\n",
+ ")\n",
+ "api.upload_folder(\n",
+ " folder_path=save_folder,\n",
+ " repo_id=f\"{USERNAME}/{MODEL_NAME}-GPTQ\",\n",
+ " token=hf_token\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "id": "ZC9Nsr9u5WhN"
+ },
+ "outputs": [],
+ "source": [
+ "# @title # 🦙 ExLlamaV2\n",
+ "\n",
+ "# @markdown Learn more about ExLlamaV2 in [this article](https://mlabonne.github.io/blog/posts/ExLlamaV2_The_Fastest_Library_to_Run%C2%A0LLMs.html).\n",
+ "\n",
+ "BPW = 5.0 # @param {type:\"number\"}\n",
+ "\n",
+ "# Install ExLLamaV2\n",
+ "!git clone https://github.com/turboderp/exllamav2\n",
+ "!pip install -e exllamav2\n",
+ "!cp -r {MODEL_NAME} base_model\n",
+ "!rm base_model/*.bin\n",
+ "\n",
+ "# Download dataset\n",
+ "!wget https://huggingface.co/datasets/wikitext/resolve/9a9e482b5987f9d25b3a9b2883fc6cc9fd8071b3/wikitext-103-v1/wikitext-test.parquet\n",
+ "\n",
+ "# Quantize model\n",
+ "save_folder = MODEL_NAME + \"-EXL2\"\n",
+ "!mkdir {save_folder}\n",
+ "!python exllamav2/convert.py \\\n",
+ " -i base_model \\\n",
+ " -o {save_folder} \\\n",
+ " -c wikitext-test.parquet \\\n",
+ " -b {BPW}\n",
+ "\n",
+ "# Copy files\n",
+ "!rm -rf {save_folder}/out_tensor\n",
+ "!rsync -av --exclude='*.safetensors' --exclude='.*' ./base_model/ ./{save_folder}/\n",
+ "\n",
+ "# Create model card\n",
+ "card = ModelCard.load(MODEL_ID)\n",
+ "card.data.tags.append(\"autoquant\")\n",
+ "card.data.tags.append(\"exl2\")\n",
+ "card.save(f'{save_folder}/README.md')\n",
+ "\n",
+ "# Upload model\n",
+ "create_repo(\n",
+ " repo_id = f\"{USERNAME}/{MODEL_NAME}-{BPW:.1f}bpw-exl2\",\n",
+ " repo_type=\"model\",\n",
+ " exist_ok=True,\n",
+ " token=hf_token\n",
+ ")\n",
+ "api.upload_folder(\n",
+ " folder_path=save_folder,\n",
+ " repo_id=f\"{USERNAME}/{MODEL_NAME}-{BPW:.1f}bpw-exl2\",\n",
+ " token=hf_token\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "id": "MyyUO2Fj3WHt"
+ },
+ "outputs": [],
+ "source": [
+ "# @title ## ⚖️ AWQ\n",
+ "\n",
+ "# @markdown See the [AutoAWQ repository](https://github.com/casper-hansen/AutoAWQ) for more information.\n",
+ "\n",
+ "# Install AutoAWQ\n",
+ "!pip install -qqq -U https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.4/autoawq-0.2.4+cu118-cp310-cp310-linux_x86_64.whl\n",
+ "!pip install zstandard\n",
+ "\n",
+ "from awq import AutoAWQForCausalLM\n",
+ "from transformers import AutoTokenizer\n",
+ "\n",
+ "BITS = 4 # @param {type: \"integer\"}\n",
+ "GROUP_SIZE = 128 # @param {type: \"integer\"}\n",
+ "VERSION = \"GEMM\" # @param {type: \"string\"}\n",
+ "ZERO_POINT = True # @param {type: \"boolean\"}\n",
+ "\n",
+ "quant_config = {\n",
+ " \"w_bit\": BITS,\n",
+ " \"q_group_size\": GROUP_SIZE,\n",
+ " \"version\": VERSION,\n",
+ " \"zero_point\": ZERO_POINT\n",
+ "}\n",
+ "save_folder = MODEL_NAME + \"-AWQ\"\n",
+ "\n",
+ "# Quantize model\n",
+ "model = AutoAWQForCausalLM.from_pretrained(MODEL_NAME, safetensors=True, low_cpu_mem_usage=True)\n",
+ "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)\n",
+ "model.quantize(tokenizer, quant_config=quant_config)\n",
+ "\n",
+ "# Save model and tokenizer\n",
+ "model.save_quantized(save_folder)\n",
+ "tokenizer.save_pretrained(save_folder)\n",
+ "\n",
+ "# Create model card\n",
+ "card = ModelCard.load(MODEL_ID)\n",
+ "card.data.tags.append(\"autoquant\")\n",
+ "card.data.tags.append(\"awq\")\n",
+ "card.save(f'{save_folder}/README.md')\n",
+ "\n",
+ "# Upload model\n",
+ "create_repo(\n",
+ " repo_id = f\"{USERNAME}/{MODEL_NAME}-AWQ\",\n",
+ " repo_type=\"model\",\n",
+ " exist_ok=True,\n",
+ " token=hf_token\n",
+ ")\n",
+ "api.upload_folder(\n",
+ " folder_path=save_folder,\n",
+ " repo_id=f\"{USERNAME}/{MODEL_NAME}-AWQ\",\n",
+ " token=hf_token\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "id": "iEhLsUjcnNR7"
+ },
+ "outputs": [],
+ "source": [
+ "# @title ## 🐘 HQQ\n",
+ "\n",
+ "# @markdown See the official [HQQ repository](https://github.com/mobiusml/hqq) for more information.\n",
+ "\n",
+ "!git clone https://github.com/mobiusml/hqq.git\n",
+ "!pip install -e hqq\n",
+ "!python hqq/kernels/setup_cuda.py install\n",
+ "!pip install flash-attn --no-build-isolation\n",
+ "!pip install transformers --upgrade\n",
+ "import os; os.environ[\"OMP_NUM_THREADS\"] = \"8\"; os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
+ "\n",
+ "import torch\n",
+ "from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer\n",
+ "from hqq.models.hf.base import AutoHQQHFModel\n",
+ "from hqq.core.quantize import *\n",
+ "\n",
+ "BITS = 2 # @param {type:\"integer\"}\n",
+ "GROUP_SIZE = 128 # @param {type:\"integer\"}\n",
+ "\n",
+ "# Quant config\n",
+ "quant_config = BaseQuantizeConfig(\n",
+ " nbits=BITS,\n",
+ " group_size=GROUP_SIZE\n",
+ ")\n",
+ "\n",
+ "# Quantize model\n",
+ "model = HQQModelForCausalLM.from_pretrained(\n",
+ " MODEL_ID,\n",
+ " cache_dir=\".\",\n",
+ " attn_implementation=\"flash_attention_2\"\n",
+ ")\n",
+ "tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)\n",
+ "model.quantize_model(quant_config=quant_config, device='cuda')\n",
+ "\n",
+ "# Save model and tokenizer\n",
+ "save_folder = MODEL_NAME + \"-HQQ\"\n",
+ "model.save_quantized(save_folder)\n",
+ "tokenizer.save_pretrained(save_folder)\n",
+ "\n",
+ "# Create model card\n",
+ "card = ModelCard.load(MODEL_ID)\n",
+ "card.data.tags.append(\"autoquant\")\n",
+ "card.data.tags.append(\"hqq\")\n",
+ "card.save(f'{save_folder}/README.md')\n",
+ "\n",
+ "# Upload model\n",
+ "create_repo(\n",
+ " repo_id = f\"{USERNAME}/{MODEL_NAME}-{BITS}bit-HQQ\",\n",
+ " repo_type=\"model\",\n",
+ " exist_ok=True,\n",
+ " token=hf_token\n",
+ ")\n",
+ "api.upload_folder(\n",
+ " folder_path=save_folder,\n",
+ " repo_id=f\"{USERNAME}/{MODEL_NAME}-{BITS}bit-HQQ\",\n",
+ " token=hf_token\n",
+ ")"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "gpuType": "T4",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+ }
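
For a quick sanity check of the GGUF artifact this notebook uploads, here is a minimal sketch, assuming llama-cpp-python is installed and the cell ran with the default form values above, so the repo id "Artples/Zebrafish-7B-GGUF" and the filename "zebrafish-7b.Q5_K_M.gguf" follow the f-strings in the GGUF cell; substitute the names from your own run.

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Assumed artifact names, derived from {USERNAME}/{MODEL_NAME}-GGUF and
# {MODEL_NAME.lower()}.{method.upper()}.gguf in the GGUF cell above
gguf_path = hf_hub_download(
    repo_id="Artples/Zebrafish-7B-GGUF",
    filename="zebrafish-7b.Q5_K_M.gguf",
)

# Load the quantized model and generate a few tokens as a smoke test
llm = Llama(model_path=gguf_path, n_ctx=2048, n_gpu_layers=-1)
out = llm("Q: What does the q5_k_m quantization method do? A:", max_tokens=64)
print(out["choices"][0]["text"])
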
LazyMergekit.ipynb ADDED
The diff for this file is too large to render. See raw diff