---
license: mit
---
|

Llama 2 7B quantized to 2-bit with GPTQ. The snippet below shows how the quantization was performed with Hugging Face Optimum's `GPTQQuantizer`.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from optimum.gptq import GPTQQuantizer
import torch

w = 2  # target bit width for quantization
model_path = "meta-llama/Llama-2-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16)

# Calibrate on the C4 dataset and quantize the model's linear layers to 2-bit
quantizer = GPTQQuantizer(bits=w, dataset="c4", model_seqlen=4096)
quantized_model = quantizer.quantize_model(model, tokenizer)
```
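
After quantization, the model can be saved with the quantizer and queried like any Transformers model. A minimal sketch using Optimum's `GPTQQuantizer.save`; the output directory name and the prompt are illustrative placeholders, not part of the original card:

```python
# Persist the quantized weights and tokenizer
# (the directory name is a placeholder)
save_dir = "Llama-2-7b-gptq-2bit"
quantizer.save(quantized_model, save_dir)
tokenizer.save_pretrained(save_dir)

# Quick generation sanity check (the prompt is illustrative)
inputs = tokenizer("The capital of France is", return_tensors="pt").to(quantized_model.device)
outputs = quantized_model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```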