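The snippet below loads the Wanda-pruned Vicuna-13B checkpoint (revision `0.5_2to4` of `efficient-llm/vicuna-13b-v1.3-wanda`, i.e. the 50%-sparse 2:4 variant) in fp16, reuses the tokenizer from the original `lmsys/vicuna-13b-v1.3`, and generates a short completion: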
```
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
import torch

# Wanda-pruned Vicuna-13B; the '0.5_2to4' revision selects the 50%-sparse 2:4 checkpoint
model_path = 'efficient-llm/vicuna-13b-v1.3-wanda'
config = AutoConfig.from_pretrained(model_path, revision='0.5_2to4', trust_remote_code=True)

# The pruned checkpoint reuses the tokenizer of the original Vicuna model
enc = AutoTokenizer.from_pretrained('lmsys/vicuna-13b-v1.3', trust_remote_code=True)

# Load the weights in fp16 and let device_map='auto' place them across available GPUs
kwargs = {"torch_dtype": torch.float16, "low_cpu_mem_usage": True}
model = AutoModelForCausalLM.from_pretrained(
    model_path, config=config, trust_remote_code=True, device_map='auto', revision='0.5_2to4', **kwargs)
model.eval()

# Tokenize a prompt, move it to the model's device, and generate up to 128 tokens
input_ids = enc('How are you today?', return_tensors='pt').input_ids.to(model.device)
outputs = model.generate(input_ids=input_ids, max_length=128)
print(enc.decode(outputs[0]))
```
|