import gradio as gr
from openai import OpenAI
import os
import requests
import json

# Hugging Face access token read from the environment. It is used both as the
# API key of the OpenAI-compatible client below and as the bearer token of the
# Cerebras request built in respond().
ACCESS_TOKEN = os.getenv("HF_TOKEN")
if ACCESS_TOKEN:
    print("Access token loaded.")
else:
    # Report the real state instead of unconditionally claiming success, so a
    # missing secret is visible in the logs before the first request fails.
    print("Warning: HF_TOKEN environment variable is not set.")

# Initialize the OpenAI client for HF Inference
hf_client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("HF Inference OpenAI client initialized.")

# Cerebras API endpoint (reached through the Hugging Face router)
CEREBRAS_API_URL = "https://router.huggingface.co/cerebras/v1/chat/completions"

# (connect, read) timeouts in seconds for the streaming Cerebras request, so a
# stalled connection cannot hang the chat handler forever.
_CEREBRAS_TIMEOUT = (10, 120)


def _build_chat_messages(system_message, history, message):
    """Assemble the chat message list: system prompt, prior turns, then the new user message."""
    messages = [{"role": "system", "content": system_message}]
    print("Initial messages array constructed.")

    # Each history entry is indexed as (user_text, assistant_text); empty or
    # missing parts are skipped.
    for val in history or []:
        user_part = val[0]
        assistant_part = val[1]
        if user_part:
            messages.append({"role": "user", "content": user_part})
            print(f"Added user message to context: {user_part}")
        if assistant_part:
            messages.append({"role": "assistant", "content": assistant_part})
            print(f"Added assistant message to context: {assistant_part}")

    messages.append({"role": "user", "content": message})
    print("Latest user message appended.")
    return messages


def _iter_sse_tokens(lines):
    """Yield each non-empty content delta found in an iterable of raw SSE byte lines."""
    for line in lines:
        # Blank lines separate events; lines starting with ":" are SSE comments.
        if not line or line.startswith(b":"):
            continue

        # Accept both "data: {...}" and "data:{...}".
        if line.startswith(b"data:"):
            line = line[len(b"data:"):].strip()

        # End-of-stream marker.
        if line == b"[DONE]":
            break

        try:
            chunk = json.loads(line)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f"Error decoding JSON: {e}, Line: {line}")
            continue

        if not isinstance(chunk, dict):
            continue

        # "choices" may be absent or empty and "delta" may be null; treat all
        # of those as "no text in this chunk" instead of raising.
        choices = chunk.get("choices") or [{}]
        token_text = (choices[0].get("delta") or {}).get("content")
        if token_text:
            yield token_text


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    custom_model,
    provider  # New parameter for provider selection
):
    """Stream a chat completion from the selected provider.

    Args:
        message: The latest user message.
        history: Prior turns; each entry is indexed as (user_text, assistant_text).
        system_message: Content of the leading system message.
        max_tokens: Forwarded to the provider as ``max_tokens``.
        temperature: Forwarded sampling temperature.
        top_p: Forwarded nucleus-sampling value.
        frequency_penalty: Forwarded frequency penalty.
        seed: Sampling seed; ``-1`` means "no fixed seed".
        custom_model: Model id to use; empty, whitespace-only or ``None`` falls
            back to ``meta-llama/Llama-3.3-70B-Instruct``.
        provider: ``"hf-inference"`` or ``"cerebras"``.

    Yields:
        The response text accumulated so far, once per received token.

    Raises:
        ValueError: If ``provider`` is not one of the supported values.
        requests.HTTPError: If the Cerebras endpoint answers with an error status.
    """
    print(f"Received message: {message}")
    print(f"History: {history}")
    print(f"System message: {system_message}")
    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
    print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
    print(f"Selected model (custom_model): {custom_model}")
    print(f"Selected provider: {provider}")

    # Convert seed to None if -1 (meaning random)
    if seed == -1:
        seed = None

    messages = _build_chat_messages(system_message, history, message)

    # If user provided a model, use that; otherwise, fall back to a default model.
    # custom_model may arrive as None, so normalise before stripping.
    model_to_use = (custom_model or "").strip() or "meta-llama/Llama-3.3-70B-Instruct"
    print(f"Model selected for inference: {model_to_use}")

    # Start with an empty string to build the response as tokens stream in
    response = ""

    if provider == "hf-inference":
        print("Using HF Inference API.")
        # Use the OpenAI client for HF Inference
        stream = hf_client.chat.completions.create(
            model=model_to_use,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            seed=seed,
            messages=messages,
        )
        for message_chunk in stream:
            # A chunk without choices carries no text; skip it rather than
            # fail on choices[0].
            if not message_chunk.choices:
                continue
            token_text = message_chunk.choices[0].delta.content
            if token_text is not None:  # Handle None values that might come in stream
                print(f"Received token: {token_text}")
                response += token_text
                yield response

    elif provider == "cerebras":
        print("Using Cerebras API via HF Router.")

        # Prepare headers and payload for the Cerebras API
        headers = {
            "Authorization": f"Bearer {ACCESS_TOKEN}",
            "Content-Type": "application/json"
        }

        payload = {
            "model": model_to_use,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "top_p": top_p,
            "frequency_penalty": frequency_penalty,
            "stream": True
        }

        if seed is not None:
            payload["seed"] = seed

        # Make the streaming request to Cerebras
        with requests.post(
            CEREBRAS_API_URL,
            headers=headers,
            json=payload,
            stream=True,
            timeout=_CEREBRAS_TIMEOUT,
        ) as req:
            # Surface HTTP errors (bad token, unknown model, ...) instead of
            # silently producing an empty reply.
            req.raise_for_status()

            # Handle Server-Sent Events (SSE) format
            for token_text in _iter_sse_tokens(req.iter_lines()):
                print(f"Received Cerebras token: {token_text}")
                response += token_text
                yield response

    else:
        # An unknown provider used to fall through and yield nothing.
        raise ValueError(f"Unsupported provider: {provider!r}")

    print("Completed response generation.")

# GRADIO UI
# The components below are created up front; most of them are later handed to
# gr.ChatInterface (the chatbot directly, the rest through additional_inputs).

chatbot = gr.Chatbot(
    layout="panel",
    height=600,
    placeholder="Select a model and begin chatting",
    show_copy_button=True,
)
print("Chatbot interface created.")

# Free-text system prompt; empty by default.
system_message_box = gr.Textbox(
    label="System Prompt",
    placeholder="You are a helpful assistant.",
    value="",
)

# Generation parameter sliders, in the order respond() receives them.
max_tokens_slider = gr.Slider(label="Max new tokens", minimum=1, maximum=4096, step=1, value=512)
temperature_slider = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7)
top_p_slider = gr.Slider(label="Top-P", minimum=0.1, maximum=1.0, step=0.05, value=0.95)
frequency_penalty_slider = gr.Slider(label="Frequency Penalty", minimum=-2.0, maximum=2.0, step=0.1, value=0.0)
# respond() turns a seed of -1 into None before calling the provider.
seed_slider = gr.Slider(label="Seed (-1 for random)", minimum=-1, maximum=65535, step=1, value=-1)

# The custom_model_box is what the respond function sees as "custom_model"
custom_model_box = gr.Textbox(
    label="Custom Model",
    info="(Optional) Provide a custom Hugging Face model path. Overrides any selected featured model.",
    placeholder="meta-llama/Llama-3.3-70B-Instruct",
    value="",
)

# Provider selector; respond() receives the chosen string as "provider".
provider_radio = gr.Radio(
    label="Inference Provider",
    info="Select which inference provider to use",
    choices=["hf-inference", "cerebras"],
    value="hf-inference",
)

def set_custom_model_from_radio(selected):
    """Return the value to place in the Custom Model text box for a radio selection.

    Wired as the change handler of the 'Featured Models' radio, so picking a
    featured model copies its id into the Custom Model box.

    Args:
        selected: The model id chosen in the radio, or None when nothing is
            selected.

    Returns:
        The selected model id, or an empty string when ``selected`` is None so
        the text box never receives None.
    """
    print(f"Featured model selected: {selected}")
    return "" if selected is None else selected

# Chat UI wired to respond(). The additional inputs are passed to respond()
# after (message, history), so their order must match its signature.
demo = gr.ChatInterface(
    respond,
    chatbot=chatbot,
    theme="Nymbo/Nymbo_Theme",
    fill_height=True,
    additional_inputs=[
        system_message_box,        # -> system_message
        max_tokens_slider,         # -> max_tokens
        temperature_slider,        # -> temperature
        top_p_slider,              # -> top_p
        frequency_penalty_slider,  # -> frequency_penalty
        seed_slider,               # -> seed
        custom_model_box,          # -> custom_model
        provider_radio,            # -> provider
    ],
)
print("ChatInterface object created.")

# Extra panels appended to the ChatInterface layout.
with demo:
    # Featured-model picker: a search box that filters a radio list; choosing
    # an entry copies it into the Custom Model text box.
    with gr.Accordion("Model Selection", open=False):
        model_search_box = gr.Textbox(
            label="Filter Models",
            placeholder="Search for a featured model...",
            lines=1
        )
        print("Model search box created.")

        models_list = [
            "meta-llama/Llama-3.3-70B-Instruct",
            "meta-llama/Llama-3.1-70B-Instruct",
            # NOTE(review): confirm this repo id exists on the Hub.
            "meta-llama/Llama-3.0-70B-Instruct",
            "meta-llama/Llama-3.2-3B-Instruct",
            "meta-llama/Llama-3.2-1B-Instruct",
            "meta-llama/Llama-3.1-8B-Instruct",
            "NousResearch/Hermes-3-Llama-3.1-8B",
            "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
            "mistralai/Mistral-Nemo-Instruct-2407",
            "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "mistralai/Mistral-7B-Instruct-v0.3",
            "mistralai/Mistral-7B-Instruct-v0.2",
            "Qwen/Qwen3-235B-A22B",
            "Qwen/Qwen3-32B",
            "Qwen/Qwen2.5-72B-Instruct",
            "Qwen/Qwen2.5-3B-Instruct",
            "Qwen/Qwen2.5-0.5B-Instruct",
            "Qwen/QwQ-32B",
            "Qwen/Qwen2.5-Coder-32B-Instruct",
            "microsoft/Phi-3.5-mini-instruct",
            "microsoft/Phi-3-mini-128k-instruct",
            "microsoft/Phi-3-mini-4k-instruct",
            "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
            "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
            "HuggingFaceH4/zephyr-7b-beta",
            "HuggingFaceTB/SmolLM2-360M-Instruct",
            "tiiuae/falcon-7b-instruct",
            "01-ai/Yi-1.5-34B-Chat",
        ]
        print("Models list initialized.")

        featured_model_radio = gr.Radio(
            label="Select a model below",
            choices=models_list,
            value="meta-llama/Llama-3.3-70B-Instruct",
            interactive=True
        )
        print("Featured models radio button created.")

        def filter_models(search_term):
            """Return a radio update listing the models that contain the search term.

            Matching is case-insensitive. Surrounding whitespace is ignored and
            a missing (None) term is treated as empty, which keeps every model.
            """
            print(f"Filtering models with search term: {search_term}")
            # Normalise once, outside the comprehension.
            needle = (search_term or "").strip().lower()
            filtered = [m for m in models_list if needle in m.lower()]
            print(f"Filtered models: {filtered}")
            return gr.update(choices=filtered)

        model_search_box.change(
            fn=filter_models,
            inputs=model_search_box,
            outputs=featured_model_radio
        )
        print("Model search box change event linked.")

        featured_model_radio.change(
            fn=set_custom_model_from_radio,
            inputs=featured_model_radio,
            outputs=custom_model_box
        )
        print("Featured model radio button change event linked.")

    # Informational accordion only: it holds explanatory Markdown. The provider
    # radio is NOT placed here; it reaches the UI through the ChatInterface's
    # additional_inputs list.
    with gr.Accordion("Advanced Settings", open=False):
        gr.Markdown("### Inference Provider")
        gr.Markdown("Select which provider to use for inference. Default is Hugging Face Inference API.")
        gr.Markdown("Note: Different providers may support different models and parameters.")

print("Gradio interface initialized.")

# Start the Gradio app only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    print("Launching the demo application.")
    demo.launch(show_api=True)