import json
import os

import gradio as gr
import requests
import sseclient

# Chat-completions endpoint of the OpenAI-compatible server. Defaults to a
# local server on port 8000; set the MOI_API_URL environment variable to point
# the demo at a different host/port without editing this file.
API_URL = os.environ.get("MOI_API_URL") or "http://localhost:8000/v1/chat/completions"

def _history_to_messages(history):
    """Convert Gradio chat history into OpenAI-style message dicts.

    Accepts the ``(user, assistant)`` pair format as well as entries that are
    already ``{"role": ..., "content": ...}`` dicts, so a history made of
    dicts is forwarded instead of being mis-unpacked into its keys.
    Empty/None turns are skipped.
    """
    messages = []
    for entry in history or []:
        if isinstance(entry, dict):
            role, content = entry.get("role"), entry.get("content")
            if role and content:
                messages.append({"role": role, "content": content})
            continue
        user, assistant = entry
        if user:
            messages.append({"role": "user", "content": user})
        if assistant:
            messages.append({"role": "assistant", "content": assistant})
    return messages


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    beta,
):
    """Stream a chat completion from ``API_URL`` for the Gradio chat UI.

    Args:
        message: The new user message.
        history: Previous turns, as ``(user, assistant)`` pairs (dict entries
            with ``role``/``content`` keys are also tolerated).
        system_message: Content of the leading system message.
        max_tokens: Forwarded as ``max_tokens`` in the request payload.
        temperature: Forwarded as ``temperature``.
        top_p: Forwarded as ``top_p``.
        beta: Sent as the ``X-MIXINPUTS-BETA`` request header.

    Yields:
        The accumulated assistant text after each streamed chunk. On failure,
        yields ``"[ERROR] <exception>"`` (appended to any text already
        received) instead of raising.
    """
    # Build message history: system prompt, prior turns, then the new message.
    messages = [{"role": "system", "content": system_message}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    # Prepare request payload
    payload = {
        "model": "Qwen/Qwen3-4B",  # Update to your actual model if needed
        "messages": messages,
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
        "stream": True,
    }

    # Beta travels as a custom HTTP header (not a payload field); the server
    # has to be set up to read it for the value to have any effect.
    headers = {
        "Content-Type": "application/json",
        "X-MIXINPUTS-BETA": str(beta),
    }

    # Stream response using SSE (Server-Sent Events)
    full_text = ""
    try:
        # (connect, read) timeout so an unreachable or stalled server cannot
        # hang the UI forever; the context manager releases the connection
        # even if the consumer stops iterating early.
        with requests.post(
            API_URL,
            json=payload,
            stream=True,
            headers=headers,
            timeout=(10, 300),
        ) as response:
            response.raise_for_status()
            for event in sseclient.SSEClient(response).events():
                data = (event.data or "").strip()
                if data == "[DONE]":
                    break
                if not data:
                    continue
                # SSE events carry the JSON chunk as text in ``event.data``.
                choices = json.loads(data).get("choices") or []
                if not choices:
                    # e.g. a chunk carrying no choices; nothing to append.
                    continue
                # ``content`` may be absent or null (such as a role-only delta).
                delta = (choices[0].get("delta") or {}).get("content") or ""
                full_text += delta
                yield full_text

    except Exception as e:
        # UI boundary: surface the failure in the chat instead of crashing,
        # keeping whatever text was already streamed.
        error = f"[ERROR] {e}"
        yield f"{full_text}\n\n{error}" if full_text else error

# Chat UI. The extra controls are passed to `respond` after (message, history),
# in the order listed: system_message, max_tokens, temperature, top_p, beta.
_extra_controls = [
    gr.Textbox(
        label="System message",
        value="You are a helpful assistant using Mixture of Inputs.",
    ),
    gr.Slider(label="Max new tokens", minimum=1, maximum=2048, step=1, value=512),
    gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        minimum=0.1,
        maximum=1.0,
        step=0.05,
        value=0.95,
    ),
    gr.Slider(label="MoI Beta", minimum=0.0, maximum=10.0, step=0.1, value=1.0),
]

demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=_extra_controls,
    title="🧪 Mixture of Inputs (MoI) Demo",
    description="Streaming local vLLM demo with dynamic MoI beta adjustment.",
)

# Launch the Gradio app only when this file is run as a script, so importing
# the module (e.g. to reuse `demo` or `respond`) does not start it.
if __name__ == "__main__":
    demo.launch()