# app.py
"""Gradio front-end for a vLLM server running Mixture of Inputs (MoI).

Streams chat completions token-by-token from an OpenAI-compatible
endpoint and exposes the MoI *beta* blending coefficient, which is
forwarded to the server via a custom HTTP header.
"""
import json

import gradio as gr
import requests

# OpenAI-compatible chat-completions endpoint served by vLLM.
API_URL = "http://0.0.0.0:8000/v1/chat/completions"


def stream_completion(message, history, max_tokens, temperature, top_p, beta):
    """Gradio callback: stream the assistant’s reply token-by-token.

    Parameters
    ----------
    message : str
        The new user message.
    history : list[tuple[str, str]]
        Prior (user, assistant) turns as maintained by ``gr.Chatbot``.
    max_tokens : int | float
        Generation cap; coerced to ``int`` before sending.
    temperature, top_p : float
        Sampling parameters passed through to the server.
    beta : float
        MoI blending coefficient, sent in the ``X-MIXINPUTS-BETA`` header
        (higher beta → less blending).

    Yields
    ------
    list[tuple[str, str]]
        The updated chat transcript, so Gradio live-updates the Chatbot.
    """
    # -------- build OpenAI-style message list (no system prompt) -------------
    # BUG FIX: the original flattened `history` into a flat list of *strings*
    # and then unpacked each string as a 2-tuple — that raises ValueError for
    # any message whose length is not exactly 2 characters.  Walk the
    # (user, assistant) pairs directly instead, preserving the original
    # intent: alternate user/assistant roles and drop empty strings.
    messages = []
    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    payload = {
        "model": "Qwen/Qwen3-4B",
        "messages": messages,
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": int(max_tokens),
        "stream": True,
    }
    headers = {
        "Content-Type": "application/json",
        # Custom knob read by the MoI-patched vLLM server.
        "X-MIXINPUTS-BETA": str(beta),
    }

    try:
        # (connect timeout 10 s, read timeout None) — the stream stays open
        # for as long as the server keeps emitting tokens.
        with requests.post(
            API_URL, json=payload, stream=True, headers=headers, timeout=(10, None)
        ) as resp:
            resp.raise_for_status()

            assistant = ""
            # BUG FIX: with decode_unicode=True the chunks are str, so the
            # delimiter must also be str (the original passed bytes b"\n",
            # which makes requests raise a str/bytes TypeError).
            for raw in resp.iter_lines(decode_unicode=True, delimiter="\n"):
                if not raw:
                    continue

                # SSE frames are prefixed with 'data: '; strip it if present.
                data = raw[6:] if raw.startswith("data: ") else raw
                if data.strip() == "[DONE]":
                    break

                delta = json.loads(data)["choices"][0]["delta"].get("content", "")
                assistant += delta
                yield history + [(message, assistant)]  # live update in Gradio

    except Exception as err:
        # Surface network/HTTP/parse errors directly in the chat window
        # rather than crashing the callback.
        yield history + [(message, f"[ERROR] {err}")]


# ---------------------------- UI --------------------------------------------
with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo") as demo:
    gr.Markdown(
        "## 🎨 Mixture of Inputs (MoI) Demo \n"
        "Streaming vLLM demo with dynamic **beta** adjustment in MoI "
        "(higher beta → less blending)."
    )

    with gr.Row():  # sliders first
        beta = gr.Slider(0.0, 10.0, value=1.0, step=0.1, label="MoI β")
        temperature = gr.Slider(0.1, 1.0, value=0.6, step=0.1, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=0.80, step=0.05, label="Top-p")
        max_tokens = gr.Slider(1, 2048, value=512, step=1, label="Max new tokens")

    chatbot = gr.Chatbot(height=450)
    user_box = gr.Textbox(
        placeholder="Type a message and press Enter…", show_label=False
    )
    clear_btn = gr.Button("Clear chat")

    # Submitting the textbox streams a completion into the chatbot.
    user_box.submit(
        fn=stream_completion,
        inputs=[user_box, chatbot, max_tokens, temperature, top_p, beta],
        outputs=chatbot,
    )
    # Clearing simply resets the Chatbot component to empty.
    clear_btn.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.launch()