import gradio as gr
import requests
import os
import spaces
from server import setup_mixinputs, launch_vllm_server

API_URL = "http://localhost:8000/v1/chat/completions"


def chat_with_moi(message, history, temperature, top_p, beta):
    # `history` is currently unused: each request is sent as a single-turn
    # conversation. Multi-turn context would require appending it to `messages`.
    #
    # Set the MIXINPUTS_BETA env var *per request*.
    # Note: this only updates the environment of this Gradio process; an
    # already-running vLLM server will not pick up the change unless it
    # re-reads the variable (or is relaunched, e.g. via launch_vllm_server).
    os.environ["MIXINPUTS_BETA"] = str(beta)
    # setup_mixinputs()
    # launch_vllm_server(beta=beta)

    payload = {
        "model": "Qwen/Qwen3-4B",  # match what your vLLM server expects
        "messages": [{"role": "user", "content": message}],
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": 512,
    }

    try:
        response = requests.post(API_URL, json=payload)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]
    except Exception as e:
        return f"[ERROR] {str(e)}"


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 🧪 Mixture of Inputs (MoI) Demo with vLLM")

    with gr.Row():
        temperature = gr.Slider(0.0, 1.5, value=0.7, label="Temperature")
        top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p")
        beta = gr.Slider(0.0, 10.0, value=1.0, label="MoI Beta")

    chatbot = gr.Chatbot()
    message = gr.Textbox(label="Your message")
    send_btn = gr.Button("Send")
    history = gr.State([])

    # Query the model, append the (user, reply) pair to the running history,
    # and return it twice: once for the Chatbot display, once for the State.
    def respond(user_message, chat_history, temperature, top_p, beta):
        reply = chat_with_moi(user_message, chat_history, temperature, top_p, beta)
        chat_history = chat_history + [(user_message, reply)]
        return chat_history, chat_history

    send_btn.click(
        respond,
        inputs=[message, history, temperature, top_p, beta],
        outputs=[chatbot, history],
    )

demo.launch()