import gradio as gr
import os
import requests
import threading
from typing import List, Dict

# Get the Hugging Face API key from Spaces secrets
HF_API_KEY = os.getenv("HF_API_KEY")

# Model endpoints configuration
MODEL_ENDPOINTS = {
    "Qwen2.5-72B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-72B-Instruct",
    "Llama3.3-70B-Instruct": "https://api-inference.huggingface.co/models/meta-llama/Llama-3.3-70B-Instruct",
    "Qwen2.5-Coder-32B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B-Instruct",
}


def query_model(model_name: str, messages: List[Dict[str, str]]) -> str:
    """Query a single model with the latest user message."""
    endpoint = MODEL_ENDPOINTS[model_name]
    headers = {
        "Authorization": f"Bearer {HF_API_KEY}",
        "Content-Type": "application/json",
    }

    # Model-specific prompt formatting (chat templates for Qwen and Llama 3)
    model_prompts = {
        "Qwen2.5-72B-Instruct": (
            f"<|im_start|>user\n{messages[-1]['content']}<|im_end|>\n<|im_start|>assistant\n"
        ),
        "Llama3.3-70B-Instruct": (
            "<|begin_of_text|>"
            "<|start_header_id|>user<|end_header_id|>\n\n"
            f"{messages[-1]['content']}<|eot_id|>"
            "<|start_header_id|>assistant<|end_header_id|>\n\n"
        ),
        "Qwen2.5-Coder-32B-Instruct": (
            f"<|im_start|>user\n{messages[-1]['content']}<|im_end|>\n<|im_start|>assistant\n"
        ),
    }

    # Model-specific stop sequences
    stop_sequences = {
        "Qwen2.5-72B-Instruct": ["<|im_end|>", "<|endoftext|>"],
        "Llama3.3-70B-Instruct": ["<|eot_id|>", "\nuser:"],
        "Qwen2.5-Coder-32B-Instruct": ["<|im_end|>", "<|endoftext|>"],
    }

    payload = {
        "inputs": model_prompts[model_name],
        "parameters": {
            # The text-generation Inference API expects "max_new_tokens", not "max_tokens"
            "max_new_tokens": 1024,
            "temperature": 0.7,
            "stop_sequences": stop_sequences[model_name],
            "return_full_text": False,
        },
    }

    try:
        response = requests.post(endpoint, json=payload, headers=headers)
        response.raise_for_status()
        result = response.json()[0]["generated_text"]

        # Clean up response formatting
        result = result.split("<|")[0]  # Remove any remaining special tokens
        result = result.replace("**", "").replace("##", "")  # Remove markdown
        result = result.strip()  # Remove leading/trailing whitespace
        return result.split("\n\n")[0]  # Return only the first paragraph
    except Exception as e:
        return f"{model_name} error: {str(e)}"


def respond(message: str, history: List[List[str]]) -> str:
    """Handle chat responses from all models."""
    # Prepare the latest message in OpenAI format (prior history is not forwarded)
    messages = [{"role": "user", "content": message}]

    # Create threads for concurrent model queries
    threads = []
    results = {}

    def get_model_response(model_name):
        results[model_name] = query_model(model_name, messages)

    for model_name in MODEL_ENDPOINTS:
        thread = threading.Thread(target=get_model_response, args=(model_name,))
        thread.start()
        threads.append(thread)

    # Wait for all threads to complete
    for thread in threads:
        thread.join()

    # Format responses from all models with clear separation
    responses = [f"**{model_name}**:\n{response}" for model_name, response in results.items()]
    return "\n\n----------------------------------------\n\n".join(responses)


# Create the Gradio interface
chat_interface = gr.ChatInterface(
    respond,
    title="Multi-LLM Collaboration Chat",
    description="A group chat with Qwen2.5-72B, Llama3.3-70B, and Qwen2.5-Coder-32B",
    examples=["How can I optimize Python code?", "Explain quantum computing basics"],
    theme="soft",
)

if __name__ == "__main__":
    chat_interface.launch(share=True)
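
# Local usage sketch (assumes the file is saved as app.py and that you have a valid
# Hugging Face access token; both names below are placeholders, not part of the app):
#
#   export HF_API_KEY=hf_your_token_here
#   python app.py
#
# To smoke-test a single backend without starting the Gradio UI, something like this
# should work, since the launch call is guarded by __main__:
#
#   python -c "from app import query_model; \
#              print(query_model('Qwen2.5-Coder-32B-Instruct', \
#                                [{'role': 'user', 'content': 'Say hello'}]))"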