# app.py
"""Gradio front-end for a vLLM server running Mixture of Inputs (MoI).

Streams chat completions token-by-token from an OpenAI-compatible
endpoint and exposes the MoI *beta* blending coefficient, which is
forwarded to the server via a custom HTTP header.
"""
import json

import gradio as gr
import requests

# OpenAI-compatible chat-completions endpoint served by vLLM.
API_URL = "http://0.0.0.0:8000/v1/chat/completions"


def stream_completion(message, history, max_tokens, temperature, top_p, beta):
    """Gradio callback: stream the assistant’s reply token-by-token.

    Parameters
    ----------
    message : str
        The new user message.
    history : list[tuple[str, str]]
        Prior (user, assistant) turns as maintained by ``gr.Chatbot``.
    max_tokens : int | float
        Generation cap; coerced to ``int`` before sending.
    temperature, top_p : float
        Sampling parameters passed through to the server.
    beta : float
        MoI blending coefficient, sent in the ``X-MIXINPUTS-BETA`` header
        (higher beta → less blending).

    Yields
    ------
    list[tuple[str, str]]
        The updated chat transcript, so Gradio live-updates the Chatbot.
    """
    # -------- build OpenAI-style message list (no system prompt) -------------
    # BUG FIX: the original flattened `history` into a flat list of *strings*
    # and then unpacked each string as a 2-tuple — that raises ValueError for
    # any message whose length is not exactly 2 characters.  Walk the
    # (user, assistant) pairs directly instead, preserving the original
    # intent: alternate user/assistant roles and drop empty strings.
    messages = []
    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    payload = {
        "model": "Qwen/Qwen3-4B",
        "messages": messages,
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": int(max_tokens),
        "stream": True,
    }
    headers = {
        "Content-Type": "application/json",
        # Custom knob read by the MoI-patched vLLM server.
        "X-MIXINPUTS-BETA": str(beta),
    }

    try:
        # (connect timeout 10 s, read timeout None) — the stream stays open
        # for as long as the server keeps emitting tokens.
        with requests.post(
            API_URL, json=payload, stream=True, headers=headers, timeout=(10, None)
        ) as resp:
            resp.raise_for_status()

            assistant = ""
            # BUG FIX: with decode_unicode=True the chunks are str, so the
            # delimiter must also be str (the original passed bytes b"\n",
            # which makes requests raise a str/bytes TypeError).
            for raw in resp.iter_lines(decode_unicode=True, delimiter="\n"):
                if not raw:
                    continue

                # SSE frames are prefixed with 'data: '; strip it if present.
                data = raw[6:] if raw.startswith("data: ") else raw
                if data.strip() == "[DONE]":
                    break

                delta = json.loads(data)["choices"][0]["delta"].get("content", "")
                assistant += delta
                yield history + [(message, assistant)]  # live update in Gradio

    except Exception as err:
        # Surface network/HTTP/parse errors directly in the chat window
        # rather than crashing the callback.
        yield history + [(message, f"[ERROR] {err}")]


# ---------------------------- UI --------------------------------------------
with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo") as demo:
    gr.Markdown(
        "## 🎨 Mixture of Inputs (MoI) Demo \n"
        "Streaming vLLM demo with dynamic **beta** adjustment in MoI "
        "(higher beta → less blending)."
    )

    with gr.Row():  # sliders first
        beta = gr.Slider(0.0, 10.0, value=1.0, step=0.1, label="MoI β")
        temperature = gr.Slider(0.1, 1.0, value=0.6, step=0.1, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=0.80, step=0.05, label="Top-p")
        max_tokens = gr.Slider(1, 2048, value=512, step=1, label="Max new tokens")

    chatbot = gr.Chatbot(height=450)
    user_box = gr.Textbox(
        placeholder="Type a message and press Enter…", show_label=False
    )
    clear_btn = gr.Button("Clear chat")

    # Submitting the textbox streams a completion into the chatbot.
    user_box.submit(
        fn=stream_completion,
        inputs=[user_box, chatbot, max_tokens, temperature, top_p, beta],
        outputs=chatbot,
    )
    # Clearing simply resets the Chatbot component to empty.
    clear_btn.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.launch()