miike-ai committed
Commit ee98774 · verified · Parent: fe501b0

Create README.md

Files changed (1): README.md (+66, -0)
README.md ADDED
---
base_model:
- Qwen/Qwen2.5-Coder-14B-Instruct
---
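The script below is a minimal interactive chat loop for this model with vLLM: it loads the checkpoint, keeps a running `User:` / `Bot:` transcript, and prints each reply with a simulated streaming effect. Type `exit` or `quit` to leave the chat.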
```python
#!/usr/bin/env python3
import time
from vllm import LLM, SamplingParams

def main():
    # Hard-coded model and tensor parallel configuration.
    model_path = "miike-ai/qwen-14b-coder-fp8"
    tensor_parallel_size = 1

    # Define sampling parameters with an increased max_tokens and a stop string.
    sampling_params = SamplingParams(
        temperature=0.0,
        top_p=0.95,
        max_tokens=32000,  # Increase this to allow longer responses.
        stop=["\nUser:"],  # Stop when the model outputs a new user marker.
    )

    print(f"Loading model '{model_path}' ...")
    model = LLM(
        model=model_path,
        enforce_eager=True,
        dtype="auto",
        tensor_parallel_size=tensor_parallel_size,
    )
    print("Model loaded. You can now chat!")
    print("Type 'exit' or 'quit' to end the conversation.\n")

    conversation = ""
    while True:
        try:
            user_input = input("User: ").strip()
        except (KeyboardInterrupt, EOFError):
            print("\nExiting chat.")
            break

        if user_input.lower() in {"exit", "quit"}:
            print("Exiting chat.")
            break

        # Append the user's input to the conversation history.
        conversation += f"User: {user_input}\nBot: "
        print("Bot: ", end="", flush=True)

        # Generate a response using the conversation history and sampling parameters.
        response = model.generate(conversation, sampling_params=sampling_params)
        # Extract the generated reply.
        bot_reply = response[0].outputs[0].text.strip()

        # Simulate streaming by printing one character at a time.
        for char in bot_reply:
            print(char, end="", flush=True)
            time.sleep(0.02)  # Adjust delay (in seconds) as desired.
        print()  # Newline after bot reply.

        # Append the bot reply to conversation history.
        conversation += bot_reply + "\n"

if __name__ == "__main__":
    main()
```
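
Since the base model is an instruct-tuned chat model, formatting turns with the tokenizer's chat template may work better than raw `User:` / `Bot:` strings. The sketch below is one possible variant, not part of this repo's tested code; it assumes `transformers` is installed alongside vLLM and that this repo ships the base model's chat template.

```python
# Hedged sketch: chat-template prompting (assumes transformers is installed
# and the repo includes the base model's chat template in its tokenizer config).
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

model_path = "miike-ai/qwen-14b-coder-fp8"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = LLM(model=model_path, enforce_eager=True, dtype="auto")
sampling_params = SamplingParams(temperature=0.0, top_p=0.95, max_tokens=1024)

# Keep the history as role/content messages instead of a raw string.
messages = [{"role": "user", "content": "Write a Python function that reverses a string."}]

# Render the messages with the model's chat template, ending with the
# assistant prefix so the model continues as the assistant.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

response = model.generate(prompt, sampling_params=sampling_params)
print(response[0].outputs[0].text.strip())
```

The rest of the interactive loop above (streaming printout, exit handling) can stay the same; only the prompt construction changes.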