Summary
A 4-bit quantization of scb10x/typhoon-7b that requires less than 8 GB of VRAM.
Steps to reproduce
# load tokenizer (import first, then the run parameters it depends on)
from transformers import AutoTokenizer

# init parameters
# model_name is the Hugging Face Hub id handed to every from_pretrained() call.
model_name = 'scb10x/typhoon-7b'
# quantization_mode selects the loading recipe used in the "load model" step;
# possible values = {'q4-bnb_cuda', 'q8-bnb_cuda', 'q4-torch_ptdq', 'q8-torch_ptdq'}
quantization_mode = 'q4-bnb_cuda'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token id as the padding token id.
tokenizer.pad_token_id = tokenizer.eos_token_id
print(tokenizer)  # LlamaTokenizerFast
# load model
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Choose the loading / quantization recipe named by `quantization_mode`.
# Any value that matches none of the four recipes falls through to the plain,
# unquantized model.
if quantization_mode == 'q4-bnb_cuda':  # ampere architecture with 8gb vram + cpu with 20gb is recommended
    print('4-bits bitsandbytes quantization with cuda')
    # The bitsandbytes options go through `quantization_config`: the bare
    # `load_in_4bit=` keyword is deprecated in newer transformers releases,
    # and this file's requirements track the transformers development branch.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
        device_map='auto',
        torch_dtype=torch.bfloat16)
elif quantization_mode == 'q8-bnb_cuda':  # ampere architecture with 12gb vram + cpu with 20gb is recommended
    print('8-bits bitsandbytes quantization with cuda')
    # Same reasoning as the 4-bit branch: use `quantization_config` instead of
    # the deprecated `load_in_8bit=` keyword.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        device_map='auto',
        torch_dtype=torch.bfloat16)
elif quantization_mode == 'q4-torch_ptdq':  # cpu with 64gb++ ram is recommended
    print('4-bits x2 post training dynamic quantization')
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32)
    # NOTE(review): quantize_dynamic documents only float16 and qint8 for the
    # simple `dtype=` form. With quint4x2 the Linear layers are presumably left
    # in float32 -- confirm before relying on this mode for memory savings.
    model = torch.quantization.quantize_dynamic(base_model, dtype=torch.quint4x2)
elif quantization_mode == 'q8-torch_ptdq':  # cpu with 64gb++ ram is recommended
    print('8-bits post training dynamic quantization')
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32)
    # torch.qint8 (not quint8) is the 8-bit dtype quantize_dynamic documents
    # for its simple `dtype=` form, under which Linear layers are quantized.
    model = torch.quantization.quantize_dynamic(base_model, dtype=torch.qint8)
else:
    print('default model')
    model = AutoModelForCausalLM.from_pretrained(model_name)
print(model)  # MistralForCausalLM
# text generator
from transformers import GenerationConfig, TextGenerationPipeline

# Load the generation config stored for `model_name`, then override the
# sampling settings used by this demo one attribute at a time.
config = GenerationConfig.from_pretrained(model_name)
sampling_overrides = {
    'num_return_sequences': 1,
    'do_sample': True,
    'max_new_tokens': 128,
    'temperature': 0.7,
    'top_p': 0.95,
    'repetition_penalty': 1.3,
}
for option_name, option_value in sampling_overrides.items():
    setattr(config, option_name, option_value)

# Build the pipeline from the already-loaded model and tokenizer;
# return_full_text=True is passed so the output includes the prompt text.
pipeline_kwargs = {
    'model': model,
    'tokenizer': tokenizer,
    'return_full_text': True,
    'generation_config': config,
}
generator = TextGenerationPipeline(**pipeline_kwargs)
# sample
# Thai prompt meaning "What is the meaning of life?", terminated by a newline.
sample = 'ความหมายของชีวิตคืออะไร?\n'
# Pad with the end-of-sequence token id during generation.
output = generator(sample, pad_token_id=tokenizer.eos_token_id)
# Only the first returned sequence is printed.
first_result = output[0]
print(first_result['generated_text'])
requirement.txt
torch==2.1.2
accelerate==0.25.0
bitsandbytes==0.41.3
#transformers==4.37.0.dev0
transformers @ git+https://github.com/huggingface/transformers
- Downloads last month
- 20
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social
visibility and check back later, or deploy to Inference Endpoints (dedicated)
instead.