In [1]:
# !pip3 install https://github.com/casper-hansen/AutoAWQ/releases/download/v0.1.8/autoawq-0.1.8+cu118-cp310-cp310-linux_x86_64.whl

In [2]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
from awq import AutoAWQForCausalLM
from transformers import AutoConfig, AwqConfig, AutoTokenizer, AutoModelForCausalLM
import torch

model_path = 'mesolitica/malaysian-mistral-7b-32k-instructions-v4'

In [4]:
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype = torch.bfloat16)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
!rm -rf test2

In [6]:
model.save_pretrained('./test2', safe_serialization = False)

In [7]:
model = AutoAWQForCausalLM.from_pretrained('./test2')

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
quant_path = 'malaysian-mistral-7b-32k-instructions-v4-awq'
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model.quantize(tokenizer, quant_config=quant_config, calib_data = 'mesolitica/malaysian-calibration')

Downloading data:   0%|          | 0.00/470M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

AWQ: 100%|██████████| 32/32 [09:49<00:00, 18.41s/it]


In [9]:
model.save_quantized(quant_path, safetensors = False)
tokenizer.save_pretrained(quant_path)



('malaysian-mistral-7b-32k-instructions-v4-awq/tokenizer_config.json',
 'malaysian-mistral-7b-32k-instructions-v4-awq/special_tokens_map.json',
 'malaysian-mistral-7b-32k-instructions-v4-awq/tokenizer.json')

In [10]:
tokenizer.push_to_hub('mesolitica/malaysian-mistral-7b-32k-instructions-v4-AWQ')

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-mistral-7b-32k-instructions-v4-AWQ/commit/cba1704e3977bd29352015ee3b4c2a26efa17666', commit_message='Upload tokenizer', commit_description='', oid='cba1704e3977bd29352015ee3b4c2a26efa17666', pr_url=None, pr_revision=None, pr_num=None)

In [11]:
quantization_config = AwqConfig(
    bits=quant_config['w_bit'],
    group_size=quant_config['q_group_size'],
    zero_point=quant_config['zero_point'],
    backend='autoawq',
    version=quant_config['version'].lower(),
)

config = AutoConfig.from_pretrained(model_path)
config.quantization_config = quantization_config

config.push_to_hub('mesolitica/malaysian-mistral-7b-32k-instructions-v4-AWQ')

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-mistral-7b-32k-instructions-v4-AWQ/commit/e72805b82ace8600987f5bae5e336b336d9cd7d0', commit_message='Upload config', commit_description='', oid='e72805b82ace8600987f5bae5e336b336d9cd7d0', pr_url=None, pr_revision=None, pr_num=None)

In [12]:
!ls malaysian-mistral-7b-32k-instructions-v4-awq

config.json		quant_config.json	 tokenizer_config.json
generation_config.json	special_tokens_map.json
pytorch_model.bin	tokenizer.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
from huggingface_hub import HfApi

api = HfApi()

In [14]:
api.upload_file(
    path_or_fileobj='malaysian-mistral-7b-32k-instructions-v4-awq/pytorch_model.bin',
    path_in_repo="pytorch_model.bin",
    repo_id='mesolitica/malaysian-mistral-7b-32k-instructions-v4-AWQ',
    repo_type="model",
)

pytorch_model.bin:   0%|          | 0.00/4.15G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-mistral-7b-32k-instructions-v4-AWQ/commit/be94bbf12947f371acbc8ebb374f1340f3a308cd', commit_message='Upload pytorch_model.bin with huggingface_hub', commit_description='', oid='be94bbf12947f371acbc8ebb374f1340f3a308cd', pr_url=None, pr_revision=None, pr_num=None)

In [15]:
api.upload_file(
    path_or_fileobj='malaysian-mistral-7b-32k-instructions-v4-awq/quant_config.json',
    path_in_repo="quant_config.json",
    repo_id='mesolitica/malaysian-mistral-7b-32k-instructions-v4-AWQ',
    repo_type="model",
)

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-mistral-7b-32k-instructions-v4-AWQ/commit/7235673cad93ca30015d0b1d66f96f658af70f2c', commit_message='Upload quant_config.json with huggingface_hub', commit_description='', oid='7235673cad93ca30015d0b1d66f96f658af70f2c', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
quantized_model = AutoModelForCausalLM.from_pretrained('mesolitica/malaysian-mistral-7b-32k-instructions-v4-AWQ')
_ = quantized_model.cuda()

You have loaded an AWQ model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model.


pytorch_model.bin:   0%|          | 0.00/4.15G [00:00<?, ?B/s]

In [None]:
messages = [
    {'role': 'user', 'content': 'KWSP tu apa'}
]
prompt = tokenizer.apply_chat_template(messages, tokenize = False)
inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')

In [None]:
%%time

generate_kwargs = dict(
    inputs,
    max_new_tokens=100,
    top_p=0.95,
    top_k=50,
    temperature=0.9,
    do_sample=True,
    num_beams=1,
)
r = quantized_model.generate(**generate_kwargs)
tokenizer.decode(r[0])