import json

import gradio as gr
import requests
import sseclient  # sseclient-py package (pip install sseclient-py)

# OpenAI-compatible chat completions endpoint served by a local vLLM instance
API_URL = "http://localhost:8000/v1/chat/completions"

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    beta,
):
    # Build message history
    messages = [{"role": "system", "content": system_message}]
    for user, assistant in history:
        if user:
            messages.append({"role": "user", "content": user})
        if assistant:
            messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})
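    # For example, history=[("Hi", "Hello!")] with message="How are you?" yields:
    #   [{"role": "system", "content": system_message},
    #    {"role": "user", "content": "Hi"},
    #    {"role": "assistant", "content": "Hello!"},
    #    {"role": "user", "content": "How are you?"}]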

    # Prepare request payload
    payload = {
        "model": "Qwen/Qwen3-4B",  # update to your actual model if needed
        "messages": messages,
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
        "stream": True,
    }

    # Optional: pass the MoI beta as a custom HTTP header
    headers = {
        "Content-Type": "application/json",
        "X-MIXINPUTS-BETA": str(beta),  # only takes effect if your vLLM build reads it
    }
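    # NOTE: stock vLLM ignores unknown headers, so this assumes a patched server.
    # A hypothetical server-side hook could read the value along these lines:
    #   beta = float(raw_request.headers.get("X-MIXINPUTS-BETA", "1.0"))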

    # Stream the response using SSE (Server-Sent Events)
    try:
        response = requests.post(API_URL, json=payload, stream=True, headers=headers)
        response.raise_for_status()
        client = sseclient.SSEClient(response)

        full_text = ""
        for event in client.events():
            if event.data == "[DONE]":
                break
            # Each event carries one JSON chunk in the OpenAI streaming format,
            # e.g. {"choices": [{"delta": {"content": "Hel"}, ...}], ...}
            chunk = json.loads(event.data)
            delta = chunk["choices"][0]["delta"].get("content") or ""
            full_text += delta
            yield full_text
    except Exception as e:
        yield f"[ERROR] {e}"


# UI layout using ChatInterface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant using Mixture of Inputs.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
        gr.Slider(minimum=0.0, maximum=10.0, value=1.0, step=0.1, label="MoI Beta"),
    ],
    title="🧪 Mixture of Inputs (MoI) Demo",
    description="Streaming local vLLM demo with dynamic MoI beta adjustment.",
)

if __name__ == "__main__":
    demo.launch()
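
# --- Running this demo (a sketch, assuming a local vLLM server) ---
# Start an OpenAI-compatible vLLM server first, e.g.:
#   vllm serve Qwen/Qwen3-4B --port 8000
# Install the client dependencies (sseclient-py provides the sseclient module):
#   pip install gradio requests sseclient-py
# Then run this file and open the printed Gradio URL:
#   python app.py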