---
base_model:
- Qwen/Qwen2.5-Coder-14B-Instruct
---

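The script below is a minimal interactive chat loop for this model built on [vLLM](https://github.com/vllm-project/vllm). It keeps the entire conversation in a single prompt string, regenerates from it on every turn, and simulates streaming by printing the reply one character at a time. Install vLLM (`pip install vllm`) and run the file directly on a GPU host.
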
```python
#!/usr/bin/env python3
import time

from vllm import LLM, SamplingParams


def main():
    # Hard-coded model and tensor-parallel configuration.
    model_path = "miike-ai/qwen-14b-coder-fp8"
    tensor_parallel_size = 1

    # Sampling parameters: greedy decoding, a large token budget, and a
    # stop string so the model does not write the next user turn itself.
    sampling_params = SamplingParams(
        temperature=0.0,
        top_p=0.95,  # No effect at temperature 0; relevant if you raise it.
        max_tokens=32000,  # Increase this to allow longer responses.
        stop=["\nUser:"],  # Stop when the model emits a new user marker.
    )
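    # Note: prompt tokens plus max_tokens must fit in the model's context
    # window, so a long chat history can exceed it with a 32000-token
    # budget; lower max_tokens if vLLM rejects the request.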

    print(f"Loading model '{model_path}' ...")
    model = LLM(
        model=model_path,
        enforce_eager=True,
        dtype="auto",
        tensor_parallel_size=tensor_parallel_size,
    )
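    # enforce_eager=True skips CUDA graph capture: startup is faster and
    # uses less GPU memory, at some cost to decoding speed.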
print("Model loaded. You can now chat!") |
|
print("Type 'exit' or 'quit' to end the conversation.\n") |
|
|
|

    conversation = ""
    while True:
        try:
            user_input = input("User: ").strip()
        except (KeyboardInterrupt, EOFError):
            print("\nExiting chat.")
            break

        if user_input.lower() in {"exit", "quit"}:
            print("Exiting chat.")
            break

        # Append the user's turn to the transcript; the trailing "Bot: "
        # cues the model to answer as the assistant.
        conversation += f"User: {user_input}\nBot: "
        print("Bot: ", end="", flush=True)

        # Generate a response from the full conversation history.
        response = model.generate(conversation, sampling_params=sampling_params)
        # generate() returns one RequestOutput per prompt; take the text of
        # the first (and only) completion.
        bot_reply = response[0].outputs[0].text.strip()

        # Simulate streaming by printing one character at a time.
        for char in bot_reply:
            print(char, end="", flush=True)
            time.sleep(0.02)  # Adjust the delay (in seconds) as desired.
        print()  # Newline after the bot reply.

        # Append the bot reply to the conversation history.
        conversation += bot_reply + "\n"


if __name__ == "__main__":
    main()
```
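
Because the base model is instruction-tuned, responses are usually better when the prompt goes through the model's chat template rather than a hand-rolled `User:`/`Bot:` transcript. Recent vLLM releases expose this via `LLM.chat`, which applies the tokenizer's chat template automatically. A minimal sketch of the same loop on that API, assuming a vLLM version that provides `LLM.chat`; treat it as an untested variant of the script above:

```python
from vllm import LLM, SamplingParams

llm = LLM(model="miike-ai/qwen-14b-coder-fp8", enforce_eager=True)
params = SamplingParams(temperature=0.0, max_tokens=2048)

messages = []  # running chat history as {"role": ..., "content": ...} dicts
while True:
    user_input = input("User: ").strip()
    if user_input.lower() in {"exit", "quit"}:
        break
    messages.append({"role": "user", "content": user_input})
    # LLM.chat applies the model's chat template before generating.
    outputs = llm.chat(messages, sampling_params=params)
    reply = outputs[0].outputs[0].text.strip()
    print(f"Bot: {reply}")
    messages.append({"role": "assistant", "content": reply})
```

Keeping the history as structured messages also avoids the stop-string workaround: the chat template terminates each assistant turn on its own.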