from pydantic import BaseModel
from llama_cpp import Llama
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
import httpx
import asyncio
import gradio as gr
import os
from dotenv import load_dotenv
import spaces

# Let python-dotenv populate the process environment before anything below
# reads from it.
load_dotenv()

# Hugging Face access token taken from the environment; None when unset.
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")

# Shared module-level state.
#   'models': filled later with the loaded models, keyed by display name.
#   'tokens': short token-role name -> placeholder string; nothing in the
#             visible part of this file reads it.
global_data = dict(
    models={},
    tokens=dict(
        eos='eos_token',
        pad='pad_token',
        padding='padding_token',
        unk='unk_token',
        bos='bos_token',
        sep='sep_token',
        cls='cls_token',
        mask='mask_token',
    ),
)

# GGUF models to load at start-up. Each entry gives the Hugging Face repo, the
# file inside that repo, and the display name used as the key for the loaded
# model. Names must be unique: a repeated entry would be submitted to the
# loader pool twice and could load the same model concurrently.
model_configs = [
    {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf", "name": "GPT-2 XL"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-8B Instruct"},
    {"repo_id": "Ffftdtd5dtft/gemma-2-9b-it-Q2_K-GGUF", "filename": "gemma-2-9b-it-q2_k.gguf", "name": "Gemma 2-9B IT"},
    {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf", "name": "Gemma 2-27B"},
    {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "filename": "phi-3-mini-128k-instruct-q2_k.gguf", "name": "Phi-3 Mini 128K Instruct"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-q2_k.gguf", "name": "Meta Llama 3.1-8B"},
    {"repo_id": "Ffftdtd5dtft/Qwen2-7B-Instruct-Q2_K-GGUF", "filename": "qwen2-7b-instruct-q2_k.gguf", "name": "Qwen2 7B Instruct"},
    {"repo_id": "Ffftdtd5dtft/starcoder2-3b-Q2_K-GGUF", "filename": "starcoder2-3b-q2_k.gguf", "name": "Starcoder2 3B"},
    {"repo_id": "Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "filename": "qwen2-1.5b-instruct-q2_k.gguf", "name": "Qwen2 1.5B Instruct"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-q2_k.gguf", "name": "Meta Llama 3.1-70B"},
    {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral Nemo Instruct 2407"},
    {"repo_id": "Ffftdtd5dtft/Hermes-3-Llama-3.1-8B-IQ1_S-GGUF", "filename": "hermes-3-llama-3.1-8b-iq1_s-imat.gguf", "name": "Hermes 3 Llama 3.1-8B"},
    {"repo_id": "Ffftdtd5dtft/Phi-3.5-mini-instruct-Q2_K-GGUF", "filename": "phi-3.5-mini-instruct-q2_k.gguf", "name": "Phi 3.5 Mini Instruct"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-70B Instruct"},
    {"repo_id": "Ffftdtd5dtft/codegemma-2b-IQ1_S-GGUF", "filename": "codegemma-2b-iq1_s-imat.gguf", "name": "Codegemma 2B"},
    {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-IQ2_XXS-GGUF", "filename": "phi-3-mini-128k-instruct-iq2_xxs-imat.gguf", "name": "Phi 3 Mini 128K Instruct XXS"},
    {"repo_id": "Ffftdtd5dtft/TinyLlama-1.1B-Chat-v1.0-IQ1_S-GGUF", "filename": "tinyllama-1.1b-chat-v1.0-iq1_s-imat.gguf", "name": "TinyLlama 1.1B Chat"},
    {"repo_id": "Ffftdtd5dtft/Mistral-NeMo-Minitron-8B-Base-IQ1_S-GGUF", "filename": "mistral-nemo-minitron-8b-base-iq1_s-imat.gguf", "name": "Mistral NeMo Minitron 8B Base"},
]

class ModelManager:
    """Loads the configured models and keeps them keyed by display name."""

    def __init__(self):
        # display name -> loaded model object
        self.models = {}

    def load_model(self, model_config):
        """Load one model described by *model_config* unless it is already cached.

        *model_config* must provide the keys ``name``, ``repo_id`` and
        ``filename``. Any failure while loading is printed and swallowed, so
        a model that cannot be loaded is simply absent from ``self.models``.
        """
        name = model_config['name']
        if name in self.models:
            return

        try:
            print(f"Loading model {name}...")
            # NOTE(review): confirm that Llama.from_pretrained forwards
            # `use_auth_token` to the Hub download rather than ignoring it.
            self.models[name] = Llama.from_pretrained(
                repo_id=model_config['repo_id'],
                filename=model_config['filename'],
                use_auth_token=HUGGINGFACE_TOKEN,
            )
            print(f"Model {name} loaded successfully.")
        except Exception as e:
            print(f"Error loading model {name}: {e}")

    def load_all_models(self):
        """Load every entry of ``model_configs`` on a thread pool.

        Leaving the ``with`` block waits for all submitted loads to finish.
        Returns the ``self.models`` dict (the same object, not a copy).
        """
        with ThreadPoolExecutor() as pool:
            for cfg in model_configs:
                pool.submit(self.load_model, cfg)
        return self.models

# Import-time side effect: every configured model is loaded right here, and
# the resulting name -> model dict is published through global_data.
model_manager = ModelManager()
global_data.update(models=model_manager.load_all_models())

# Request payload schema with a single string field, ``message``.
# NOTE(review): nothing in the visible part of this file references this
# model — confirm whether it is still needed.
class ChatRequest(BaseModel):
    message: str

def normalize_input(input_text):
    """Return *input_text* with leading and trailing whitespace removed."""
    trimmed = input_text.strip()
    return trimmed

def remove_duplicates(text):
    """Strip prompt-echo artefacts and repeated lines from generated *text*.

    Steps, in order:
      1. Back-to-back repeats of two known echoed prompts are collapsed to a
         single copy each.
      2. Every ``[/INST]`` marker is removed.
      3. Each line that already appeared earlier in the text is dropped
         (this includes repeated blank lines); first occurrences keep their
         original order.
    """
    collapse_rules = (
        (r'(Hello there, how are you\? \[/INST\]){2,}', 'Hello there, how are you? [/INST]'),
        (r'(How are you\? \[/INST\]){2,}', 'How are you? [/INST]'),
    )
    for pattern, single_copy in collapse_rules:
        text = re.sub(pattern, single_copy, text)

    without_markers = text.replace('[/INST]', '')
    # dict.fromkeys keeps only the first occurrence of each line, in order.
    return '\n'.join(dict.fromkeys(without_markers.split('\n')))

# Only ``duration`` is passed: it is the keyword the ZeroGPU ``spaces.GPU``
# decorator documents. The many other keyword arguments this call used to
# carry (training hyper-parameters, cluster and storage settings, ...) are not
# part of that interface; depending on the installed release they are either
# ignored or rejected with a TypeError.
# NOTE(review): duration=0 is carried over unchanged from the original call —
# confirm that it is the intended GPU time budget.
@spaces.GPU(duration=0)
def generate_model_response(model, inputs):
    """Run *inputs* through *model* and return the cleaned completion text.

    Args:
        model: A callable model object; it is invoked as ``model(inputs)`` and
            must return a mapping shaped like
            ``{'choices': [{'text': ...}, ...]}``.
        inputs: The prompt passed to the model.

    Returns:
        The text of the first choice after ``remove_duplicates`` clean-up, or
        the string ``"Error generating response."`` if anything raises (the
        error is printed, not propagated).
    """
    try:
        print(f"Generating response for model: {model}")
        response = model(inputs)
        print(f"Response from {model}: {response}")
        return remove_duplicates(response['choices'][0]['text'])
    except Exception as e:
        # Best-effort: one failing model must not break the whole request.
        print(f"Error generating model response from {model}: {e}")
        return "Error generating response."

def remove_repetitive_responses(responses):
    """Keep only the first response seen for each model.

    *responses* is an iterable of mappings with the keys ``'model'`` and
    ``'response'``. Returns a dict mapping each model to its first response,
    in first-seen order.
    """
    first_by_model = {}
    for entry in responses:
        model_name = entry['model']
        if model_name in first_by_model:
            continue
        first_by_model[model_name] = entry['response']
    return first_by_model

async def process_message(message):
    """Send *message* to every loaded model and format the replies.

    The message is normalized, each model in ``global_data['models']`` is
    queried on a worker thread via ``generate_model_response``, and the
    replies are rendered as Markdown under the name of the model that
    produced them (in the iteration order of ``global_data['models']``).

    Args:
        message: Raw user text.

    Returns:
        A ``(formatted_response, curl_command)`` tuple: the Markdown text with
        one section per model, and a cURL command preview that posts the same
        message as JSON.
    """
    # Function-scope imports: only the cURL preview below needs these.
    import json
    import shlex

    inputs = normalize_input(message)
    # Snapshot (name, model) pairs so each reply can be labelled with its
    # model; remove_repetitive_responses needs {'model', 'response'} entries.
    named_models = list(global_data['models'].items())

    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor() as executor:
        pending = [
            loop.run_in_executor(executor, generate_model_response, model, inputs)
            for _, model in named_models
        ]
        # Awaiting the worker threads (rather than blocking on them) keeps the
        # event loop free while the models generate.
        outcomes = await asyncio.gather(*pending, return_exceptions=True)

    responses = []
    for (name, _), outcome in zip(named_models, outcomes):
        if isinstance(outcome, BaseException):
            print(f"Error with model: {outcome}")
            outcome = "Error generating response."  # Default error message.
        responses.append({'model': name, 'response': outcome})

    unique_responses = remove_repetitive_responses(responses)
    formatted_response = "".join(
        f"**{model}:**\n{response}\n\n"
        for model, response in unique_responses.items()
    )

    # Serialize with json and quote for the shell so that quotes, backslashes
    # or newlines in the message cannot break the JSON body or the command.
    payload = shlex.quote(json.dumps({"message": message}, ensure_ascii=False))
    curl_command = f"""
    curl -X POST -H "Content-Type: application/json" \\
         -d {payload} \\
         http://localhost:7860/generate
    """
    return formatted_response, curl_command


# Gradio UI: a two-line text input wired to ``process_message``. Its two
# return values fill the outputs in order — the Markdown panel first, then the
# "cURL command" textbox.
iface = gr.Interface(
    fn=process_message,
    inputs=gr.Textbox(lines=2, placeholder="Enter your message here..."),
    outputs=[gr.Markdown(), gr.Textbox(label="cURL command")],
    title="Multi-Model LLM API",
    description="Enter a message and get responses from multiple LLMs.",
)

if __name__ == "__main__":
    # Honour a PORT override from the environment; otherwise serve on 7860.
    port = int(os.getenv("PORT", "7860"))
    iface.launch(server_port=port)