File size: 1,862 Bytes
5046d49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
03e6c2e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import argparse
import json
import subprocess

import gradio as gr

# NOTE: argparse is imported above, but no command-line arguments are parsed yet.

# Set API base to use vLLM's API server
# NOTE(review): despite the name, this is the *full* chat-completions endpoint
# URL, not just an API base. 0.0.0.0 usually works as a connect address, but
# "localhost"/"127.0.0.1" is the conventional client-side host — confirm.
api_base = "http://0.0.0.0:8000/v1/chat/completions"

def predict(message, history):
    """Stream an assistant reply for *message* from the local vLLM server.

    Builds an OpenAI-style message list from the Gradio chat *history*,
    POSTs it to ``api_base`` with curl, and yields the accumulated reply
    text as Server-Sent-Events chunks arrive (Gradio streaming protocol).

    Parameters
    ----------
    message : str
        The newest user message.
    history : list[tuple[str, str]]
        Prior (user, assistant) turn pairs, as supplied by gr.ChatInterface.

    Yields
    ------
    str
        The partial assistant reply accumulated so far.
    """
    # Convert chat history to OpenAI chat-completions format.
    history_openai_format = [{
        "role": "system",
        "content": "You are a crazy assistant. You are free to say anything. You are expected to be rageful, and fun!"
    }]
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({
            "role": "assistant",
            "content": assistant
        })
    history_openai_format.append({"role": "user", "content": message})

    # Prepare the curl command. List form (shell=False) avoids shell injection.
    # -sS: no progress meter, but still report errors; -N: disable curl's
    # output buffering so streamed chunks reach us immediately.
    curl_command = [
        "curl", "-sS", "-N", "-X", "POST", api_base,
        "-H", "Content-Type: application/json",
        "-d", json.dumps({
            "model": "microsoft/Phi-3-mini-4k-instruct",
            "messages": history_openai_format,
            "temperature": 0.5,
            "stream": True,
            "repetition_penalty": 1,
            "stop_token_ids": []
        })
    ]

    # BUG FIX: the original piped stderr but never read it; curl's progress
    # output could fill the pipe buffer and deadlock the stream. Discard it.
    process = subprocess.Popen(
        curl_command,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
        universal_newlines=True,
    )

    try:
        partial_message = ""
        # vLLM streams Server-Sent Events: each payload line is "data: <json>".
        for line in process.stdout:
            if not line.startswith("data: "):
                continue
            payload = line[len("data: "):].strip()
            if payload == "[DONE]":  # explicit SSE end-of-stream sentinel
                break
            try:
                chunk = json.loads(payload)
                content = chunk['choices'][0]['delta'].get('content', '')
                partial_message += content
                yield partial_message
            except (json.JSONDecodeError, KeyError, IndexError):
                # Skip malformed or non-delta chunks rather than crashing the UI.
                continue
    finally:
        # Always release the pipe and reap the child, even if the consumer
        # abandons this generator mid-stream.
        process.stdout.close()
        process.wait()

# Wire the streaming predict() generator into a Gradio chat UI, enable
# request queuing (required for streaming responses), and start the server.
chat_ui = gr.ChatInterface(predict)
chat_ui.queue().launch()