---
base_model:
- Qwen/Qwen2.5-Coder-14B-Instruct
---
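FP8 build of [Qwen/Qwen2.5-Coder-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct) (as the repository name `miike-ai/qwen-14b-coder-fp8` suggests). The script below runs a simple interactive chat loop against this checkpoint with [vLLM](https://github.com/vllm-project/vllm) (`pip install vllm`); it keeps the conversation as a plain `User:`/`Bot:` transcript and simulates streaming by printing the reply character by character.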
```python
#!/usr/bin/env python3
import time

from vllm import LLM, SamplingParams


def main():
    # Hard-coded model and tensor parallel configuration.
    model_path = "miike-ai/qwen-14b-coder-fp8"
    tensor_parallel_size = 1

    # Define sampling parameters with an increased max_tokens and a stop string.
    sampling_params = SamplingParams(
        temperature=0.0,
        top_p=0.95,
        max_tokens=32000,   # Increase this to allow longer responses.
        stop=["\nUser:"],   # Stop when the model outputs a new user marker.
    )

    print(f"Loading model '{model_path}' ...")
    model = LLM(
        model=model_path,
        enforce_eager=True,
        dtype="auto",
        tensor_parallel_size=tensor_parallel_size,
    )
    print("Model loaded. You can now chat!")
    print("Type 'exit' or 'quit' to end the conversation.\n")

    conversation = ""
    while True:
        try:
            user_input = input("User: ").strip()
        except (KeyboardInterrupt, EOFError):
            print("\nExiting chat.")
            break
        if user_input.lower() in {"exit", "quit"}:
            print("Exiting chat.")
            break

        # Append the user's input to the conversation history.
        conversation += f"User: {user_input}\nBot: "
        print("Bot: ", end="", flush=True)

        # Generate a response using the conversation history and sampling parameters.
        response = model.generate(conversation, sampling_params=sampling_params)

        # Extract the generated reply.
        bot_reply = response[0].outputs[0].text.strip()

        # Simulate streaming by printing one character at a time.
        for char in bot_reply:
            print(char, end="", flush=True)
            time.sleep(0.02)  # Adjust delay (in seconds) as desired.
        print()  # Newline after bot reply.

        # Append the bot reply to conversation history.
        conversation += bot_reply + "\n"


if __name__ == "__main__":
    main()
```
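
The loop above feeds the model a raw `User:`/`Bot:` transcript rather than the chat template an instruct-tuned checkpoint normally expects. A minimal alternative sketch, assuming a recent vLLM release that exposes `LLM.chat` (which applies the tokenizer's chat template for you):

```python
#!/usr/bin/env python3
# Sketch only: assumes your installed vLLM version provides LLM.chat().
from vllm import LLM, SamplingParams


def main():
    llm = LLM(model="miike-ai/qwen-14b-coder-fp8", enforce_eager=True, dtype="auto")
    sampling_params = SamplingParams(temperature=0.0, top_p=0.95, max_tokens=4096)

    messages = []  # running history as OpenAI-style role/content dicts
    while True:
        try:
            user_input = input("User: ").strip()
        except (KeyboardInterrupt, EOFError):
            break
        if user_input.lower() in {"exit", "quit"}:
            break

        messages.append({"role": "user", "content": user_input})
        # LLM.chat applies the model's chat template before generating.
        outputs = llm.chat(messages, sampling_params=sampling_params)
        reply = outputs[0].outputs[0].text.strip()
        print(f"Bot: {reply}")
        messages.append({"role": "assistant", "content": reply})


if __name__ == "__main__":
    main()
```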