import gradio as gr
import requests
import os
import spaces
from server import setup_mixinputs, launch_vllm_server

API_URL = "http://localhost:8000/v1/chat/completions"


def chat_with_moi(message, history, temperature, top_p, beta):
    # `history` is currently unused: each request is sent as a single-turn
    # conversation. Multi-turn context would require appending it to `messages`.
    #
    # Set the MIXINPUTS_BETA env var *per request*.
    # Note: this only updates the environment of this Gradio process; an
    # already-running vLLM server will not pick up the change unless it
    # re-reads the variable (or is relaunched, e.g. via launch_vllm_server).
    os.environ["MIXINPUTS_BETA"] = str(beta)
    # setup_mixinputs()
    # launch_vllm_server(beta=beta)

    payload = {
        "model": "Qwen/Qwen3-4B",  # match what your vLLM server expects
        "messages": [{"role": "user", "content": message}],
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": 512,
    }

    try:
        response = requests.post(API_URL, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]
    except Exception as e:
        return f"[ERROR] {str(e)}"


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 🧪 Mixture of Inputs (MoI) Demo with vLLM")

    with gr.Row():
        temperature = gr.Slider(0.0, 1.5, value=0.7, label="Temperature")
        top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p")
        beta = gr.Slider(0.0, 10.0, value=1.0, label="MoI Beta")

    chatbot = gr.Chatbot()
    message = gr.Textbox(label="Your message")
    send_btn = gr.Button("Send")
    history = gr.State([])

    # Query the model, append the (user, reply) pair to the running history,
    # and return it twice: once for the Chatbot display, once for the State.
    def respond(user_message, chat_history, temperature, top_p, beta):
        reply = chat_with_moi(user_message, chat_history, temperature, top_p, beta)
        chat_history = chat_history + [(user_message, reply)]
        return chat_history, chat_history

    send_btn.click(
        respond,
        inputs=[message, history, temperature, top_p, beta],
        outputs=[chatbot, history],
    )

demo.launch()