import json
import subprocess

import gradio as gr

# Base URL for vLLM's OpenAI-compatible chat completions endpoint
api_base = "http://localhost:8000/v1/chat/completions"


def predict(message, history):
    # Convert the Gradio chat history to the OpenAI messages format,
    # starting with the system prompt.
    history_openai_format = [{
        "role": "system",
        "content": ("You are a crazy assistant. You are free to say anything. "
                    "You are expected to be rageful, and fun!")
    }]
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({"role": "assistant", "content": assistant})
    history_openai_format.append({"role": "user", "content": message})

    # Build the curl command; -N disables output buffering so streamed
    # chunks are forwarded as soon as the server sends them.
    curl_command = [
        "curl", "-N", "-X", "POST", api_base,
        "-H", "Content-Type: application/json",
        "-d", json.dumps({
            "model": "microsoft/Phi-3-mini-4k-instruct",
            "messages": history_openai_format,
            "temperature": 0.5,
            "stream": True,
            "repetition_penalty": 1,
            "stop_token_ids": []
        })
    ]

    # Execute the curl command and stream its stdout line by line
    process = subprocess.Popen(curl_command,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               universal_newlines=True)

    partial_message = ""
    for line in process.stdout:
        # Server-sent events are prefixed with "data: "; the final
        # "data: [DONE]" sentinel is not JSON and marks end of stream.
        if line.startswith("data: "):
            payload = line[len("data: "):].strip()
            if payload == "[DONE]":
                break
            try:
                chunk = json.loads(payload)
                content = chunk['choices'][0]['delta'].get('content', '')
                partial_message += content
                yield partial_message
            except json.JSONDecodeError:
                continue

    # Wait for curl to exit before returning
    process.wait()


# Create and launch a streaming chat interface with Gradio
gr.ChatInterface(predict).queue().launch()
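
# Usage sketch (an assumption, not part of the original script): this client
# expects an OpenAI-compatible vLLM server listening on port 8000. One way to
# start it, subject to your vLLM version's flags:
#
#   python -m vllm.entrypoints.openai.api_server \
#       --model microsoft/Phi-3-mini-4k-instruct
#
# Then run this script and open the local URL that Gradio prints.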