import gradio as gr
import os
from huggingface_hub import InferenceClient
import cohere

# Backend configuration: one model id, API key and client per provider.
# Keys come from the environment and are None when the variable is unset.

# Hugging Face Inference API
HF_MODEL = "meta-llama/Llama-3.2-3B-Instruct"
HF_API_KEY = os.environ.get("HF_API_KEY")
client_hf = InferenceClient(token=HF_API_KEY, model=HF_MODEL)

# Cohere
COHERE_MODEL = "command-r-plus"
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
client_cohere = cohere.Client(COHERE_API_KEY)

def respond(
    message: str,
    history: list[tuple[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    use_cohere: bool,
):
    """Generate a chatbot reply using either the Cohere or Hugging Face API.

    Both backends receive the system prompt, the prior conversation and the
    latest user message, so switching backends does not drop context.

    Args:
        message (str): The latest user message.
        history (list[tuple[str, str]]): Previous exchanges where:
            - Each tuple contains (user_message, assistant_response).
            - Example: [("Hello", "Hi there!"), ("How are you?", "I'm good!")]
            Empty/None entries in a pair are skipped.
        system_message (str): A system-level instruction for the chatbot
            (e.g., personality, style). Omitted from the request when empty.
        max_tokens (int): Maximum number of new tokens the model can generate.
        temperature (float): Controls randomness (higher = more varied responses).
        top_p (float): Probability threshold for token selection
            (higher = more diverse responses).
        use_cohere (bool): If True, uses Cohere API; otherwise, uses Hugging Face API.

    Yields:
        str: For Cohere, the complete reply, yielded once. For Hugging Face,
        the accumulated reply so far, yielded after every streamed chunk that
        carries text.
    """
    if use_cohere:
        # Cohere's chat endpoint takes the conversation as a list of
        # {"role": "USER" | "CHATBOT", "message": ...} entries, the system
        # prompt as `preamble`, and the new user turn as `message`.
        chat_history = []
        for user_msg, assistant_msg in history:
            if user_msg:
                chat_history.append({"role": "USER", "message": user_msg})
            if assistant_msg:
                chat_history.append({"role": "CHATBOT", "message": assistant_msg})

        request = {
            "message": message,
            "model": COHERE_MODEL,
            "temperature": temperature,
            "max_tokens": max_tokens,
            # Cohere documents p as limited to [0.01, 0.99]; the UI slider
            # goes up to 1.0, so clamp instead of letting the request fail.
            "p": min(max(top_p, 0.01), 0.99),
        }
        # Only send optional fields that actually carry content.
        if system_message:
            request["preamble"] = system_message
        if chat_history:
            request["chat_history"] = chat_history

        # Non-streaming call: the whole reply is produced in one yield.
        yield client_cohere.chat(**request).text
        return

    # Hugging Face: OpenAI-style list of role/content messages.
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    response = ""
    # `chunk` (not `message`) so the user-message parameter is not shadowed.
    for chunk in client_hf.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        if not chunk.choices:
            continue
        token = chunk.choices[0].delta.content
        # Some chunks (e.g. role-only or final ones) carry no text; skipping
        # them avoids `str += None` raising TypeError mid-stream.
        if not token:
            continue
        response += token
        yield response

# Chat UI. The extra controls below line up, in order, with the parameters
# of `respond` that follow (message, history).
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        # System instruction
        gr.Textbox(label="System prompt", value="You are a friendly Chatbot."),
        # Token limit
        gr.Slider(label="Max new tokens", minimum=1, maximum=2048, step=1, value=512),
        # Randomness control
        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7),
        # Probability mass
        gr.Slider(label="Top-p", minimum=0.1, maximum=1.0, step=0.05, value=0.95),
        # Backend toggle
        gr.Checkbox(label="Use capable Cohere model instead."),
    ],
)

# Launch the interface only when executed as a script.
if __name__ == "__main__":
    demo.launch()