import json

import gradio as gr
import requests
import sseclient  # sseclient-py package (pip install sseclient-py)

# OpenAI-compatible chat completions endpoint served by a local vLLM instance
API_URL = "http://localhost:8000/v1/chat/completions"

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    beta,
):
    # Build message history
    messages = [{"role": "system", "content": system_message}]
    for user, assistant in history:
        if user:
            messages.append({"role": "user", "content": user})
        if assistant:
            messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})
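    # For example, history=[("Hi", "Hello!")] with message="How are you?" yields:
    #   [{"role": "system", "content": system_message},
    #    {"role": "user", "content": "Hi"},
    #    {"role": "assistant", "content": "Hello!"},
    #    {"role": "user", "content": "How are you?"}]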

    # Prepare request payload
    payload = {
        "model": "Qwen/Qwen3-4B",  # update to your actual model if needed
        "messages": messages,
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
        "stream": True,
    }

    # Optional: pass the MoI beta as a custom HTTP header
    headers = {
        "Content-Type": "application/json",
        "X-MIXINPUTS-BETA": str(beta),  # only takes effect if your vLLM build reads it
    }
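    # NOTE: stock vLLM ignores unknown headers, so this assumes a patched server.
    # A hypothetical server-side hook could read the value along these lines:
    #   beta = float(raw_request.headers.get("X-MIXINPUTS-BETA", "1.0"))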

    # Stream the response using SSE (Server-Sent Events)
    try:
        response = requests.post(API_URL, json=payload, stream=True, headers=headers)
        response.raise_for_status()
        client = sseclient.SSEClient(response)

        full_text = ""
        for event in client.events():
            if event.data == "[DONE]":
                break
            # Each event carries one JSON chunk in the OpenAI streaming format,
            # e.g. {"choices": [{"delta": {"content": "Hel"}, ...}], ...}
            chunk = json.loads(event.data)
            delta = chunk["choices"][0]["delta"].get("content") or ""
            full_text += delta
            yield full_text
    except Exception as e:
        yield f"[ERROR] {e}"


# UI layout using ChatInterface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant using Mixture of Inputs.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
        gr.Slider(minimum=0.0, maximum=10.0, value=1.0, step=0.1, label="MoI Beta"),
    ],
    title="🧪 Mixture of Inputs (MoI) Demo",
    description="Streaming local vLLM demo with dynamic MoI beta adjustment.",
)

if __name__ == "__main__":
    demo.launch()
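
# --- Running this demo (a sketch, assuming a local vLLM server) ---
# Start an OpenAI-compatible vLLM server first, e.g.:
#   vllm serve Qwen/Qwen3-4B --port 8000
# Install the client dependencies (sseclient-py provides the sseclient module):
#   pip install gradio requests sseclient-py
# Then run this file and open the printed Gradio URL:
#   python app.py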