miike-ai
/

qwen-14b-coder-fp8

compressed-tensors

Model card Files Files and versions

miike-ai committed on Mar 25

Commit

ee98774

·

verified ·

1 Parent(s): fe501b0

Create README.md

Files changed (1) hide show

README.md +66 -0

README.md ADDED Viewed

	@@ -0,0 +1,66 @@

+---
+base_model:
+- Qwen/Qwen2.5-Coder-14B-Instruct
+---
+```python
+#!/usr/bin/env python3
+import time
+from vllm import LLM, SamplingParams
+def main():
+    # Hard-coded model and tensor parallel configuration.
+    model_path = "miike-ai/qwen-14b-coder-fp8"
+    tensor_parallel_size = 1
+    # Define sampling parameters with an increased max_tokens and a stop string.
+    sampling_params = SamplingParams(
+        temperature=0.0,
+        top_p=0.95,
+        max_tokens=32000,       # Increase this to allow longer responses.
+        stop=["\nUser:"],     # Stop when the model outputs a new user marker.
+    )
+    print(f"Loading model '{model_path}' ...")
+    model = LLM(
+        model=model_path,
+        enforce_eager=True,
+        dtype="auto",
+        tensor_parallel_size=tensor_parallel_size,
+    )
+    print("Model loaded. You can now chat!")
+    print("Type 'exit' or 'quit' to end the conversation.\n")
+    conversation = ""
+    while True:
+        try:
+            user_input = input("User: ").strip()
+        except (KeyboardInterrupt, EOFError):
+            print("\nExiting chat.")
+            break
+        if user_input.lower() in {"exit", "quit"}:
+            print("Exiting chat.")
+            break
+        # Append the user's input to the conversation history.
+        conversation += f"User: {user_input}\nBot: "
+        print("Bot: ", end="", flush=True)
+        # Generate a response using the conversation history and sampling parameters.
+        response = model.generate(conversation, sampling_params=sampling_params)
+        # Extract the generated reply.
+        bot_reply = response[0].outputs[0].text.strip()
+        # Simulate streaming by printing one character at a time.
+        for char in bot_reply:
+            print(char, end="", flush=True)
+            time.sleep(0.02)  # Adjust delay (in seconds) as desired.
+        print()  # Newline after bot reply.
+        # Append the bot reply to conversation history.
+        conversation += bot_reply + "\n"
+# Start the chat only when this file is run as a script, not when imported.
+if __name__ == "__main__":
+    main()
+```