# PHI3XEL / vllm_openai_client.py
import argparse
import json
import subprocess

import gradio as gr

# Parse the arguments (a minimal parser; only the endpoint URL is configurable)
parser = argparse.ArgumentParser(description="Gradio chat client for a vLLM OpenAI-compatible server")
parser.add_argument("--api-url", default="http://0.0.0.0:8000/v1/chat/completions",
                    help="Full URL of the vLLM chat completions endpoint")
args = parser.parse_args()

# Full URL of vLLM's OpenAI-compatible chat completions endpoint
api_base = args.api_url
def predict(message, history):
    # Convert Gradio's (user, assistant) chat history to the OpenAI message
    # format, starting with the system prompt
    history_openai_format = [{
        "role": "system",
        "content": "You are a crazy assistant. You are free to say anything. You are expected to be rageful, and fun!"
    }]
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({"role": "assistant", "content": assistant})
    history_openai_format.append({"role": "user", "content": message})
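
    # For a one-turn history, the resulting list looks like (illustrative):
    #   [{"role": "system", "content": "You are a crazy assistant. ..."},
    #    {"role": "user", "content": "Hi"},
    #    {"role": "assistant", "content": "Hello!"},
    #    {"role": "user", "content": "<current message>"}]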
    # Prepare the curl command for a streaming chat completion request.
    # -s silences curl's progress meter; -N disables output buffering so that
    # streamed chunks are forwarded as soon as they arrive.
    curl_command = [
        "curl", "-sN", "-X", "POST", api_base,
        "-H", "Content-Type: application/json",
        "-d", json.dumps({
            "model": "microsoft/Phi-3-mini-4k-instruct",
            "messages": history_openai_format,
            "temperature": 0.5,
            "stream": True,
            "repetition_penalty": 1.0,
            "stop_token_ids": []
        })
    ]
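
    # A typical streamed line from the server looks like (illustrative):
    #   data: {"object": "chat.completion.chunk", "choices": [{"delta": {"content": "Hi"}, "index": 0}]}
    # and the stream ends with:
    #   data: [DONE]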
    # Execute curl and parse the Server-Sent Events stream: each chunk arrives
    # as a line of the form "data: {...}"
    process = subprocess.Popen(curl_command, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE, text=True)
    partial_message = ""
    for line in process.stdout:
        if line.startswith("data: "):
            payload = line[len("data: "):].strip()
            if payload == "[DONE]":
                # End-of-stream sentinel
                break
            try:
                chunk = json.loads(payload)
                content = chunk['choices'][0]['delta'].get('content', '')
                partial_message += content
                yield partial_message
            except json.JSONDecodeError:
                # Skip malformed or empty lines
                continue
    # Wait for curl to exit
    process.wait()
# Create and launch a chat interface with Gradio, streaming predict() output
if __name__ == "__main__":
    gr.ChatInterface(predict).queue().launch()
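
# Usage (assumes the vLLM server above is reachable):
#   python vllm_openai_client.py --api-url http://0.0.0.0:8000/v1/chat/completions
# Gradio serves the chat UI at http://127.0.0.1:7860 by default.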