import gradio as gr
import os
import requests
from datetime import datetime
from typing import List, Dict, Generator

# session_manager.py (not shown here) is expected to provide a SessionManager with
# create_session() -> session_id, load_session(session_id) -> dict, and
# save_session(session_id, session), as used below.
from session_manager import SessionManager

# Initialize session manager and read the Hugging Face API key from the environment
session_manager = SessionManager()
HF_API_KEY = os.getenv("HF_API_KEY")

# Model configurations
MODEL_ENDPOINTS = {
    "Qwen2.5-72B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-72B-Instruct",
    "Llama3.3-70B-Instruct": "https://api-inference.huggingface.co/models/meta-llama/Llama-3.3-70B-Instruct",
    "Qwen2.5-Coder-32B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B-Instruct",
}

MODEL_CONTEXT_WINDOWS = {
    "Qwen2.5-72B-Instruct": 128000,
    "Llama3.3-70B-Instruct": 128000,
    "Qwen2.5-Coder-32B-Instruct": 128000,
}

MODEL_MAX_TOKENS = {
    "Qwen2.5-72B-Instruct": 8192,
    "Llama3.3-70B-Instruct": 2048,
    "Qwen2.5-Coder-32B-Instruct": 8192,
}


def query_model(model_name: str, messages: List[Dict[str, str]]) -> str:
    """Query a single model with the full chat history."""
    endpoint = MODEL_ENDPOINTS[model_name]
    headers = {
        "Authorization": f"Bearer {HF_API_KEY}",
        "Content-Type": "application/json",
    }

    # Build the full conversation history for context
    conversation = "\n".join(f"{msg['role']}: {msg['content']}" for msg in messages)

    # Model-specific prompt formatting with full history
    model_prompts = {
        "Qwen2.5-72B-Instruct": (
            f"<|im_start|>system\nCollaborate with other experts. Previous discussion:\n{conversation}<|im_end|>\n"
            "<|im_start|>assistant\nMy analysis:"
        ),
        "Llama3.3-70B-Instruct": (
            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
            f"Build upon this discussion:\n{conversation}<|eot_id|>\n"
            "<|start_header_id|>assistant<|end_header_id|>\nMy contribution:"
        ),
        "Qwen2.5-Coder-32B-Instruct": (
            f"<|im_start|>system\nTechnical discussion context:\n{conversation}<|im_end|>\n"
            "<|im_start|>assistant\nTechnical perspective:"
        ),
    }

    # Model-specific stop sequences
    stop_sequences = {
        "Qwen2.5-72B-Instruct": ["<|im_end|>", "<|endoftext|>"],
        "Llama3.3-70B-Instruct": ["<|eot_id|>", "\nuser:"],
        "Qwen2.5-Coder-32B-Instruct": ["<|im_end|>", "<|endoftext|>"],
    }

    payload = {
        "inputs": model_prompts[model_name],
        "parameters": {
            # The HF Inference API text-generation parameter is max_new_tokens, not max_tokens
            "max_new_tokens": MODEL_MAX_TOKENS[model_name],
            "temperature": 0.6,
            "stop_sequences": stop_sequences[model_name],
            "return_full_text": False,
        },
    }

    try:
        response = requests.post(endpoint, json=payload, headers=headers, timeout=120)
        response.raise_for_status()
        result = response.json()[0]["generated_text"]

        # Clean up response formatting
        result = result.split("<|")[0]                        # drop any trailing special tokens
        result = result.replace("**", "").replace("##", "")   # strip markdown emphasis/headers
        return result.strip()                                 # remove leading/trailing whitespace
    except Exception as e:
        return f"{model_name} error: {str(e)}"


def respond(message: str, history: List[List[str]], session_id: str) -> Generator[str, None, None]:
    """Handle sequential model responses with context preservation."""
    # Load or initialize the session
    session = session_manager.load_session(session_id)
    if not isinstance(session, dict) or "history" not in session:
        session = {"history": []}

    # Rebuild the message context from the stored session history
    messages = []
    for entry in session["history"]:
        if entry["type"] == "user":
            messages.append({"role": "user", "content": entry["content"]})
        else:
            messages.append({"role": "assistant", "content": f"{entry['model']}: {entry['content']}"})

    # Add the current user message
    messages.append({"role": "user", "content": message})
    session["history"].append({
        "timestamp": datetime.now().isoformat(),
        "type": "user",
        "content": message,
    })

    # First model
    yield "šŸ”µ Qwen2.5-Coder-32B-Instruct is thinking..."
    response1 = query_model("Qwen2.5-Coder-32B-Instruct", messages)
    session["history"].append({
        "timestamp": datetime.now().isoformat(),
        "type": "assistant",
        "model": "Qwen2.5-Coder-32B-Instruct",
        "content": response1,
    })
    messages.append({"role": "assistant", "content": f"Qwen2.5-Coder-32B-Instruct: {response1}"})
    yield f"šŸ”µ **Qwen2.5-Coder-32B-Instruct**\n{response1}"

    # Second model
    yield f"šŸ”µ **Qwen2.5-Coder-32B-Instruct**\n{response1}\n\nšŸŸ£ Qwen2.5-72B-Instruct is thinking..."
    response2 = query_model("Qwen2.5-72B-Instruct", messages)
    session["history"].append({
        "timestamp": datetime.now().isoformat(),
        "type": "assistant",
        "model": "Qwen2.5-72B-Instruct",
        "content": response2,
    })
    messages.append({"role": "assistant", "content": f"Qwen2.5-72B-Instruct: {response2}"})
    yield f"šŸ”µ **Qwen2.5-Coder-32B-Instruct**\n{response1}\n\nšŸŸ£ **Qwen2.5-72B-Instruct**\n{response2}"

    # Final model
    yield (
        f"šŸ”µ **Qwen2.5-Coder-32B-Instruct**\n{response1}\n\n"
        f"šŸŸ£ **Qwen2.5-72B-Instruct**\n{response2}\n\n"
        "šŸŸ” Llama3.3-70B-Instruct is thinking..."
    )
    response3 = query_model("Llama3.3-70B-Instruct", messages)
    session["history"].append({
        "timestamp": datetime.now().isoformat(),
        "type": "assistant",
        "model": "Llama3.3-70B-Instruct",
        "content": response3,
    })
    messages.append({"role": "assistant", "content": f"Llama3.3-70B-Instruct: {response3}"})

    # Save the final session state
    session_manager.save_session(session_id, session)

    # Yield the final combined response
    yield (
        f"šŸ”µ **Qwen2.5-Coder-32B-Instruct**\n{response1}\n\n"
        f"šŸŸ£ **Qwen2.5-72B-Instruct**\n{response2}\n\n"
        f"šŸŸ” **Llama3.3-70B-Instruct**\n{response3}"
    )


# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Multi-LLM Collaboration Chat")

    with gr.Row():
        session_id = gr.State(session_manager.create_session)
        new_session = gr.Button("šŸ”„ New Session")

    chatbot = gr.Chatbot(height=600)
    msg = gr.Textbox(label="Message")

    def on_new_session():
        # Start a fresh session and clear the chat window
        new_id = session_manager.create_session()
        return new_id, []

    def user(message, history, session_id):
        # Append the user message and clear the textbox
        return "", history + [[message, None]]

    def bot(history, session_id):
        # Stream the sequential model responses into the last chat turn
        if history and history[-1][1] is None:
            message = history[-1][0]
            for response in respond(message, history[:-1], session_id):
                history[-1][1] = response
                yield history

    msg.submit(user, [msg, chatbot, session_id], [msg, chatbot]).then(
        bot, [chatbot, session_id], [chatbot]
    )
    new_session.click(on_new_session, None, [session_id, chatbot])


if __name__ == "__main__":
    demo.launch(share=True)