 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
---
base_model:
- Qwen/Qwen2.5-Coder-14B-Instruct
---

```python
#!/usr/bin/env python3
import time
from vllm import LLM, SamplingParams

def main():
    """Run an interactive chat REPL against a locally loaded vLLM model.

    Loads a hard-coded FP8 Qwen coder model, then loops: read a user
    line, generate a reply conditioned on the full conversation so far,
    pseudo-stream the reply to stdout, and append it to the history.

    Exits cleanly on 'exit'/'quit', Ctrl-C, or EOF at the prompt.
    """
    # Hard-coded model and tensor parallel configuration.
    model_path = "miike-ai/qwen-14b-coder-fp8"
    tensor_parallel_size = 1

    # Deterministic (temperature 0) decoding; the stop string prevents the
    # model from hallucinating the next user turn on its own.
    sampling_params = SamplingParams(
        temperature=0.0,
        top_p=0.95,
        max_tokens=32000,       # Increase this to allow longer responses.
        stop=["\nUser:"],     # Stop when the model outputs a new user marker.
    )

    print(f"Loading model '{model_path}' ...")
    model = LLM(
        model=model_path,
        enforce_eager=True,
        dtype="auto",
        tensor_parallel_size=tensor_parallel_size,
    )
    print("Model loaded. You can now chat!")
    print("Type 'exit' or 'quit' to end the conversation.\n")

    conversation = ""
    while True:
        try:
            user_input = input("User: ").strip()
        except (KeyboardInterrupt, EOFError):
            print("\nExiting chat.")
            break

        if user_input.lower() in {"exit", "quit"}:
            print("Exiting chat.")
            break

        # Robustness: don't waste a generation on a blank line.
        if not user_input:
            continue

        # Append the user's turn, then prompt the model to speak as Bot.
        conversation += f"User: {user_input}\nBot: "
        print("Bot: ", end="", flush=True)

        try:
            # Generate a response using the conversation history.
            response = model.generate(conversation, sampling_params=sampling_params)
        except KeyboardInterrupt:
            # Robustness: a Ctrl-C during a long generation should abort
            # that turn, not crash the whole REPL with a traceback.
            print("\n[generation interrupted]")
            continue

        # Extract the generated reply.
        bot_reply = response[0].outputs[0].text.strip()

        # Simulate streaming by printing one character at a time.
        for char in bot_reply:
            print(char, end="", flush=True)
            time.sleep(0.02)  # Adjust delay (in seconds) as desired.
        print()  # Newline after bot reply.

        # Append the bot reply to conversation history for later turns.
        conversation += bot_reply + "\n"

if __name__ == "__main__":
    main()
```