import argparse
import json
import subprocess

import gradio as gr

# Parse the command-line arguments
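# NOTE: the original parsing code is not shown; the flags below (--host,
# --port) are an illustrative sketch, not confirmed from the source. Their
# defaults reproduce the hardcoded endpoint used further down.
parser = argparse.ArgumentParser(
    description="Gradio chat client for a vLLM OpenAI-compatible server")
parser.add_argument("--host", type=str, default="0.0.0.0",
                    help="vLLM server host (assumed flag)")
parser.add_argument("--port", type=int, default=8000,
                    help="vLLM server port (assumed flag)")
args = parser.parse_args()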
# Set API base to use vLLM's OpenAI-compatible API server
api_base = f"http://{args.host}:{args.port}/v1/chat/completions"
def predict(message, history):
    # Convert the Gradio chat history to the OpenAI messages format
    history_openai_format = [{
        "role": "system",
        "content": "You are a crazy assistant. You are free to say anything. You are expected to be rageful, and fun!"
    }]
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({
            "role": "assistant",
            "content": assistant
        })
    history_openai_format.append({"role": "user", "content": message})
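    # Illustrative shape of the list after one prior exchange (the user
    # and assistant strings here are examples, not real data):
    # [{"role": "system", "content": "You are a crazy assistant. ..."},
    #  {"role": "user", "content": "Hi!"},
    #  {"role": "assistant", "content": "Hello!"},
    #  {"role": "user", "content": message}]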
    # Prepare the curl command; -N disables curl's output buffering so
    # streamed chunks are forwarded as soon as they arrive
    curl_command = [
        "curl", "-N", "-X", "POST", api_base,
        "-H", "Content-Type: application/json",
        "-d", json.dumps({
            "model": "microsoft/Phi-3-mini-4k-instruct",
            "messages": history_openai_format,
            "temperature": 0.5,
            "stream": True,
            "repetition_penalty": 1,
            "stop_token_ids": []
        })
    ]
    # Execute the curl command and read its stdout line by line;
    # bufsize=1 requests line buffering in text mode
    process = subprocess.Popen(curl_command, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE, text=True, bufsize=1)
    partial_message = ""
    for line in process.stdout:
        # The server streams server-sent events: each payload line is
        # prefixed with "data: "
        if line.startswith("data: "):
            data = line[len("data: "):].strip()
            if data == "[DONE]":  # end-of-stream sentinel
                break
            try:
                chunk = json.loads(data)
                content = chunk['choices'][0]['delta'].get('content', '')
                partial_message += content
                yield partial_message
            except json.JSONDecodeError:
                continue
    # Wait for curl to exit
    process.wait()
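
# Alternative sketch (not part of the original app): the same streaming
# request made with the `requests` library instead of shelling out to curl.
# Assumes `requests` is installed; the payload mirrors the curl version but
# omits the prior chat history for brevity.
import requests

def predict_with_requests(message, history):
    response = requests.post(
        api_base,
        json={
            "model": "microsoft/Phi-3-mini-4k-instruct",
            "messages": [{"role": "user", "content": message}],
            "temperature": 0.5,
            "stream": True,
        },
        stream=True,  # keep the connection open and iterate over chunks
    )
    partial_message = ""
    for line in response.iter_lines(decode_unicode=True):
        if line and line.startswith("data: "):
            data = line[len("data: "):]
            if data == "[DONE]":
                break
            chunk = json.loads(data)
            partial_message += chunk['choices'][0]['delta'].get('content', '')
            yield partial_message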
# Create and launch a chat interface with Gradio
gr.ChatInterface(predict).queue().launch()
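
# Usage sketch (assumed workflow): start the vLLM OpenAI-compatible server
# first, then run this script and open the Gradio URL it prints, e.g.:
#   python -m vllm.entrypoints.openai.api_server --model microsoft/Phi-3-mini-4k-instruct
#   python app.py --host 0.0.0.0 --port 8000
# The script name `app.py` and the flags are illustrative assumptions.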