Create README.md
Browse files
README.md
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
"""Minimal example: run a GPTQ-quantized Llama-2-13b-chat model with AutoGPTQ.

Loads 3-bit quantized weights from the `efficient-llm/llama-2-13b-chat-gptq`
repo, tokenizes a prompt, and generates a completion on CUDA.
Requires a CUDA device and access to the gated `meta-llama` tokenizer repo.
"""

from transformers import AutoTokenizer

from auto_gptq import AutoGPTQForCausalLM

# Quantized weights and tokenizer live in different repos.
# NOTE(review): presumably the Llama-2-7b tokenizer is identical to the
# 13b-chat one (shared vocab across the family) — confirm before relying on it.
model_path = 'efficient-llm/llama-2-13b-chat-gptq'
tokenizer_path = 'meta-llama/Llama-2-7b-hf'

model = AutoGPTQForCausalLM.from_quantized(
    model_path,
    # inject_fused_attention=False, # or
    # NOTE(review): disabling exllama looks necessary for this 3-bit revision
    # (exllama kernels typically support 4-bit only) — confirm.
    disable_exllama=True,
    device_map='auto',  # let accelerate place weights across available devices
    revision='3bit_128g',  # repo branch; presumably 3-bit, group size 128
)

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)

# Tokenize the prompt and move input ids to the GPU the model generates on.
input_ids = tokenizer('How are you?', return_tensors='pt').input_ids.to('cuda')
outputs = model.generate(input_ids=input_ids, max_length=128)
print(tokenizer.decode(outputs[0]))