---
base_model:
- Qwen/Qwen2.5-Coder-14B-Instruct
---

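The script below is a minimal interactive chat loop for this model built on [vLLM](https://github.com/vllm-project/vllm). It keeps the entire conversation in a single prompt string, regenerates from it on every turn, and simulates streaming by printing the reply one character at a time. Install vLLM (`pip install vllm`) and run the file directly on a GPU host.
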
```python
#!/usr/bin/env python3
import time

from vllm import LLM, SamplingParams


def main():
    # Hard-coded model and tensor-parallel configuration.
    model_path = "miike-ai/qwen-14b-coder-fp8"
    tensor_parallel_size = 1

    # Sampling parameters: greedy decoding, a large token budget, and a
    # stop string so the model does not write the next user turn itself.
    sampling_params = SamplingParams(
        temperature=0.0,
        top_p=0.95,  # No effect at temperature 0; relevant if you raise it.
        max_tokens=32000,  # Increase this to allow longer responses.
        stop=["\nUser:"],  # Stop when the model emits a new user marker.
    )
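    # Note: prompt tokens plus max_tokens must fit in the model's context
    # window, so a long chat history can exceed it with a 32000-token
    # budget; lower max_tokens if vLLM rejects the request.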

    print(f"Loading model '{model_path}' ...")
    model = LLM(
        model=model_path,
        enforce_eager=True,
        dtype="auto",
        tensor_parallel_size=tensor_parallel_size,
    )
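    # enforce_eager=True skips CUDA graph capture: startup is faster and
    # uses less GPU memory, at some cost to decoding speed.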
print("Model loaded. You can now chat!") |
|
print("Type 'exit' or 'quit' to end the conversation.\n") |
|
|
|

    conversation = ""
    while True:
        try:
            user_input = input("User: ").strip()
        except (KeyboardInterrupt, EOFError):
            print("\nExiting chat.")
            break

        if user_input.lower() in {"exit", "quit"}:
            print("Exiting chat.")
            break

        # Append the user's turn to the transcript; the trailing "Bot: "
        # cues the model to answer as the assistant.
        conversation += f"User: {user_input}\nBot: "
        print("Bot: ", end="", flush=True)

        # Generate a response from the full conversation history.
        response = model.generate(conversation, sampling_params=sampling_params)
        # generate() returns one RequestOutput per prompt; take the text of
        # the first (and only) completion.
        bot_reply = response[0].outputs[0].text.strip()

        # Simulate streaming by printing one character at a time.
        for char in bot_reply:
            print(char, end="", flush=True)
            time.sleep(0.02)  # Adjust the delay (in seconds) as desired.
        print()  # Newline after the bot reply.

        # Append the bot reply to the conversation history.
        conversation += bot_reply + "\n"


if __name__ == "__main__":
    main()
```
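
Because the base model is instruction-tuned, responses are usually better when the prompt goes through the model's chat template rather than a hand-rolled `User:`/`Bot:` transcript. Recent vLLM releases expose this via `LLM.chat`, which applies the tokenizer's chat template automatically. A minimal sketch of the same loop on that API, assuming a vLLM version that provides `LLM.chat`; treat it as an untested variant of the script above:

```python
from vllm import LLM, SamplingParams

llm = LLM(model="miike-ai/qwen-14b-coder-fp8", enforce_eager=True)
params = SamplingParams(temperature=0.0, max_tokens=2048)

messages = []  # running chat history as {"role": ..., "content": ...} dicts
while True:
    user_input = input("User: ").strip()
    if user_input.lower() in {"exit", "quit"}:
        break
    messages.append({"role": "user", "content": user_input})
    # LLM.chat applies the model's chat template before generating.
    outputs = llm.chat(messages, sampling_params=params)
    reply = outputs[0].outputs[0].text.strip()
    print(f"Bot: {reply}")
    messages.append({"role": "assistant", "content": reply})
```

Keeping the history as structured messages also avoids the stop-string workaround: the chat template terminates each assistant turn on its own.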