from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")

# Use the default decoding hyperparameters of Qwen2.5-7B-Instruct.
# max_tokens caps the maximum generation length.
sampling_params = SamplingParams(temperature=0.7, top_p=0.8, repetition_penalty=1.05, max_tokens=512)

# Input the model name or path. GPTQ and AWQ models are also supported.
llm = LLM(model="Qwen/Qwen2.5-7B-Instruct")

# Prepare your prompt and format it with the chat template
prompt = "Tell me something about large language models."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt},
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Generate outputs
outputs = llm.generate([text], sampling_params)

# Print the outputs
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")