Bad output: loading ChenMnZ/Llama-3-8b-EfficientQAT-w2g128-GPTQ through Transformers generates nothing but "!" tokens.
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "ChenMnZ/Llama-3-8b-EfficientQAT-w2g128-GPTQ"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt},
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=51,
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
/usr/local/lib/python3.11/dist-packages/huggingface_hub/file_download.py:795: FutureWarning: resume_download is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use force_download=True.
  warnings.warn(
/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: The secret HF_TOKEN does not exist in your Colab secrets. To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session. You will be able to reuse this secret in all of your notebooks. Please note that authentication is recommended but still optional to access public models or datasets.
  warnings.warn(
config.json: 100% 1.31k/1.31k [00:00<00:00, 25.6kB/s]
ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.
ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for compatibililty.
model.safetensors: 100% 3.97G/3.97G [01:34<00:00, 43.5MB/s]
INFO - Auto pick kernel based on compatibility: <class 'gptqmodel.nn_modules.qlinear.torch.TorchQuantLinear'>
/usr/local/lib/python3.11/dist-packages/transformers/modeling_utils.py:4371: FutureWarning: _is_quantized_training_enabled is going to be deprecated in transformers 4.39.0. Please use model.hf_quantizer.is_trainable instead
  warnings.warn(
The cos_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the LlamaAttention class
The sin_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the LlamaAttention class
tokenizer_config.json: 100% 50.6k/50.6k [00:00<00:00, 1.45MB/s]
tokenizer.json: 100% 9.08M/9.08M [00:00<00:00, 12.5MB/s]
special_tokens_map.json: 100% 301/301 [00:00<00:00, 25.4kB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
No chat template is defined for this tokenizer - using a default chat template that implements the ChatML format (without BOS/EOS tokens!). If the default is not appropriate for your model, please set tokenizer.chat_template to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.
Setting pad_token_id to eos_token_id:128001 for open-end generation.
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
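Note that gptqmodel auto-picked the pure-Torch kernel (TorchQuantLinear) here and generation still returns nothing but "!". A quick sanity check, using only plain PyTorch module inspection (nothing gptqmodel-specific), to confirm which quantized-linear implementation actually backs the loaded model:

# Assumes `model` is the quantized model loaded above via AutoModelForCausalLM.
from collections import Counter

quant_layers = Counter(
    type(m).__name__ for m in model.modules() if "QuantLinear" in type(m).__name__
)
# e.g. Counter({'TorchQuantLinear': 224}) for a 32-layer Llama-3-8B
# (7 quantized linear projections per decoder block).
print(quant_layers)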
Loading the same checkpoint directly with gptqmodel instead:

from transformers import AutoTokenizer
from gptqmodel import GPTQModel

quant_dir = "ChenMnZ/Llama-3-8b-EfficientQAT-w2g128-GPTQ"
# quant_dir = "ChenMnZ/Llama-2-7b-EfficientQAT-w2g128-BitBLAS"
# or local path

tokenizer = AutoTokenizer.from_pretrained(quant_dir, use_fast=True)

# load quantized model to the first GPU
model = GPTQModel.from_quantized(quant_dir)

# inference with model.generate
print(tokenizer.decode(model.generate(**tokenizer("Model quantization is", return_tensors="pt").to(model.device))[0]))
ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.
ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for compatibililty.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Fetching 8 files: 100% 8/8 [00:00<00:00, 3.22it/s]
README.md: 100% 8.72k/8.72k [00:00<00:00, 117kB/s]
quantize_config.json: 100% 459/459 [00:00<00:00, 10.6kB/s]
.gitattributes: 100% 1.52k/1.52k [00:00<00:00, 20.6kB/s]
INFO - Ignoring unknown parameter in the quantization configuration: model_name_or_path.
INFO - Ignoring unknown parameter in the quantization configuration: model_file_base_name.
INFO - Estimated Quantization BPW (bits per weight): 2.171875 bpw, based on [bits: 2, group_size: 128]
INFO - Auto enabling flash attention2
INFO - Auto pick kernel based on compatibility: <class 'gptqmodel.nn_modules.qlinear.dynamic_cuda.DynamicCudaQuantLinear'>
INFO - make_quant: Linear candidates: [<class 'gptqmodel.nn_modules.qlinear.dynamic_cuda.DynamicCudaQuantLinear'>, <class 'gptqmodel.nn_modules.qlinear.torch.TorchQuantLinear'>]
INFO - make_quant: Selected linear: <class 'gptqmodel.nn_modules.qlinear.dynamic_cuda.DynamicCudaQuantLinear'>
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting pad_token_id to eos_token_id:128001 for open-end generation.
/usr/local/lib/python3.11/dist-packages/transformers/generation/utils.py:1141: UserWarning: Using the model-agnostic default max_length (=20) to control the generation length. We recommend setting max_new_tokens to control the maximum length of the generation.
  warnings.warn(
RuntimeError                              Traceback (most recent call last)
in <cell line: 0>()
     13
     14 # inference with model.generate
---> 15 print(tokenizer.decode(model.generate(**tokenizer("Model quantization is", return_tensors="pt").to(model.device))[0]))

18 frames
/usr/local/lib/python3.11/dist-packages/gptqmodel/nn_modules/qlinear/dynamic_cuda.py in forward(self, x)
    129         )
    130
--> 131         out = out.to(x.dtype).reshape(out_shape)
    132         if self.bias is not None:
    133             out.add_(self.bias)

RuntimeError: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
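For context, "no kernel image is available for execution on the device" typically means the compiled CUDA extension was not built for this GPU's architecture. The Colab T4 is compute capability 7.5 (sm_75), which can be confirmed with standard torch calls:

import torch

# Report the GPU and its compute capability; a Colab T4 should show
# "Tesla T4" and (7, 5), i.e. sm_75.
print(torch.cuda.get_device_name(0))
print(torch.cuda.get_device_capability(0))
print(torch.version.cuda)  # CUDA version this torch build targets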
Colab T4, same model, prompt formatted manually instead of via the chat template:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "ChenMnZ/Llama-3-8b-EfficientQAT-w2g128-GPTQ"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

prompt = "Give me a short introduction to large language model."
formatted_prompt = f"<|user|>\n{prompt}\n<|assistant|>"
model_inputs = tokenizer([formatted_prompt], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=51,
)
response = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(response)
ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.
ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for compatibililty.
INFO - Auto pick kernel based on compatibility: <class 'gptqmodel.nn_modules.qlinear.torch.TorchQuantLinear'>
/usr/local/lib/python3.11/dist-packages/transformers/modeling_utils.py:4371: FutureWarning: _is_quantized_training_enabled is going to be deprecated in transformers 4.39.0. Please use model.hf_quantizer.is_trainable instead
  warnings.warn(
The cos_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the LlamaAttention class
The sin_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the LlamaAttention class
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting pad_token_id to eos_token_id:128001 for open-end generation.
<|user|>
Give me a short introduction to large language model.
<|assistant|>!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
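Since the auto-picked DynamicCudaQuantLinear kernel crashes on the T4 while the pure-Torch kernel at least runs (it just emits "!" tokens), one next step is to pin the kernel explicitly when loading through gptqmodel. This is only a sketch, assuming the installed gptqmodel version exposes a BACKEND enum and a backend argument on from_quantized, as recent releases do:

from transformers import AutoTokenizer
from gptqmodel import GPTQModel, BACKEND  # assumption: BACKEND enum is available in this gptqmodel build

quant_dir = "ChenMnZ/Llama-3-8b-EfficientQAT-w2g128-GPTQ"
tokenizer = AutoTokenizer.from_pretrained(quant_dir, use_fast=True)

# Force the pure-Torch quantized-linear kernel instead of letting gptqmodel
# auto-pick DynamicCudaQuantLinear, which fails with "no kernel image" on sm_75 here.
model = GPTQModel.from_quantized(quant_dir, backend=BACKEND.TORCH)

print(tokenizer.decode(model.generate(**tokenizer("Model quantization is", return_tensors="pt").to(model.device))[0]))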