# PHI3XEL / vllm_openai_client.py
import argparse
import json
import subprocess

import gradio as gr

# Parse the arguments (a minimal parser; only the endpoint URL is configurable)
parser = argparse.ArgumentParser(description="Gradio chat client for a vLLM OpenAI-compatible server")
parser.add_argument("--api-url", default="http://0.0.0.0:8000/v1/chat/completions",
                    help="Full URL of the vLLM chat completions endpoint")
args = parser.parse_args()

# Full URL of vLLM's OpenAI-compatible chat completions endpoint
api_base = args.api_url
def predict(message, history):
    # Convert Gradio's (user, assistant) chat history to the OpenAI message
    # format, starting with the system prompt
    history_openai_format = [{
        "role": "system",
        "content": "You are a crazy assistant. You are free to say anything. You are expected to be rageful, and fun!"
    }]
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({"role": "assistant", "content": assistant})
    history_openai_format.append({"role": "user", "content": message})
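
    # For a one-turn history, the resulting list looks like (illustrative):
    #   [{"role": "system", "content": "You are a crazy assistant. ..."},
    #    {"role": "user", "content": "Hi"},
    #    {"role": "assistant", "content": "Hello!"},
    #    {"role": "user", "content": "<current message>"}]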
    # Prepare the curl command for a streaming chat completion request.
    # -s silences curl's progress meter; -N disables output buffering so that
    # streamed chunks are forwarded as soon as they arrive.
    curl_command = [
        "curl", "-sN", "-X", "POST", api_base,
        "-H", "Content-Type: application/json",
        "-d", json.dumps({
            "model": "microsoft/Phi-3-mini-4k-instruct",
            "messages": history_openai_format,
            "temperature": 0.5,
            "stream": True,
            "repetition_penalty": 1.0,
            "stop_token_ids": []
        })
    ]
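
    # A typical streamed line from the server looks like (illustrative):
    #   data: {"object": "chat.completion.chunk", "choices": [{"delta": {"content": "Hi"}, "index": 0}]}
    # and the stream ends with:
    #   data: [DONE]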
    # Execute curl and parse the Server-Sent Events stream: each chunk arrives
    # as a line of the form "data: {...}"
    process = subprocess.Popen(curl_command, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE, text=True)
    partial_message = ""
    for line in process.stdout:
        if line.startswith("data: "):
            payload = line[len("data: "):].strip()
            if payload == "[DONE]":
                # End-of-stream sentinel
                break
            try:
                chunk = json.loads(payload)
                content = chunk['choices'][0]['delta'].get('content', '')
                partial_message += content
                yield partial_message
            except json.JSONDecodeError:
                # Skip malformed or empty lines
                continue
    # Wait for curl to exit
    process.wait()
# Create and launch a chat interface with Gradio, streaming predict() output
if __name__ == "__main__":
    gr.ChatInterface(predict).queue().launch()
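
# Usage (assumes the vLLM server above is reachable):
#   python vllm_openai_client.py --api-url http://0.0.0.0:8000/v1/chat/completions
# Gradio serves the chat UI at http://127.0.0.1:7860 by default.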