import gradio as gr
import pandas as pd
import torch
import os
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
import spaces  # Import the spaces library

# Initialize logging
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Function to load and process data
def load_data(csv_file):
    try:
        df = pd.read_csv(csv_file)
        logger.info(f"CSV columns: {df.columns.tolist()}")
        logger.info(f"Total rows in CSV: {len(df)}")
        return df
    except Exception as e:
        logger.error(f"Error loading CSV: {e}")
        return None

# Function to prepare dataset
def prepare_dataset(df, teacher_col, student_col, num_samples=100):
    # Extract and format data
    logger.info(f"Using columns: {teacher_col} (teacher) and {student_col} (student)")
    
    formatted_data = []
    for i in range(min(num_samples, len(df))):
        teacher_text = str(df.iloc[i][teacher_col])
        student_text = str(df.iloc[i][student_col])
        
        # Create prompt
        formatted_text = f"### Teacher: {teacher_text}\n### Student: {student_text}"
        formatted_data.append({"text": formatted_text})
    
    logger.info(f"Created {len(formatted_data)} formatted examples")
    
    # Create dataset
    dataset = Dataset.from_list(formatted_data)
    
    # Split dataset
    train_val_split = dataset.train_test_split(test_size=0.1, seed=42)
    
    return train_val_split

# Function to tokenize data
def tokenize_data(dataset, tokenizer, max_length=512):
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=max_length,
            padding="max_length"
        )
    
    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    return tokenized_dataset

# Main fine-tuning function with memory optimizations
def finetune_model(model_id, train_data, output_dir, epochs, batch_size=None):
    """
    Fine-tune a model with optimized memory settings to prevent CUDA OOM errors.
    """
    logger.info(f"Using model: {model_id}")
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    # ============ MEMORY OPTIMIZATION 1: REDUCED BATCH SIZE ============
    # A smaller batch size dramatically reduces memory usage during training
    actual_batch_size = 8 if batch_size is None else min(batch_size, 8)
    logger.info(f"Using batch size: {actual_batch_size} (reduced from original to save memory)")
    
    # ============ MEMORY OPTIMIZATION 2: 8-bit QUANTIZATION ============
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        load_in_8bit=True,  # Use 8-bit quantization to reduce memory usage
        device_map="auto",  # Automatically handle model distribution
        use_cache=False,    # Disable KV cache which uses extra memory
        torch_dtype=torch.float16,  # Use lower precision
    )
    
    # Count model parameters
    logger.info(f"Model parameters: {model.num_parameters():,}")
    
    # Prepare model for training with quantization
    model = prepare_model_for_kbit_training(model)
    
    # ============ MEMORY OPTIMIZATION 3: GRADIENT CHECKPOINTING ============
    model.gradient_checkpointing_enable()
    logger.info("Gradient checkpointing enabled: trading computation for memory savings")
    
    # ============ MEMORY OPTIMIZATION 4: OPTIMIZED LORA CONFIG ============
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=4,              # REDUCED from default 8/16 to save memory
        lora_alpha=16,    # Scaling factor
        lora_dropout=0.1, # Dropout probability for regularization
        target_modules=["q_proj", "v_proj"],  # Only attention query and value projections
    )
    logger.info("Using optimized LoRA parameters with reduced rank (r=4) and targeted modules")
    
    # Apply LoRA adapters to the model
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()  # Print trainable parameters info
    
    # Define training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        # ============ MEMORY OPTIMIZATION 5: REDUCED BATCH SIZE IN ARGS ============
        per_device_train_batch_size=actual_batch_size,
        per_device_eval_batch_size=actual_batch_size,
        # ============ MEMORY OPTIMIZATION 6: MIXED PRECISION TRAINING ============
        fp16=True,  # Use FP16 for mixed precision training
        # ============ MEMORY OPTIMIZATION 7: GRADIENT ACCUMULATION ============
        gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps
        # ============ MEMORY OPTIMIZATION 8: GRADIENT CHECKPOINTING IN ARGS ============
        gradient_checkpointing=True,
        # Other parameters
        logging_steps=10,
        save_strategy="epoch",
        evaluation_strategy="epoch",
        learning_rate=2e-4,
        weight_decay=0.01,
        warmup_ratio=0.03,
        # ============ MEMORY OPTIMIZATION 9: REDUCED OPTIMIZER OVERHEAD ============
        optim="adamw_torch_fused",  # More memory-efficient optimizer
        # ============ MEMORY OPTIMIZATION 10: REDUCED LOGGING MEMORY ============
        report_to="none",  # Disable extra logging to save memory
    )
    
    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data["train"],
        eval_dataset=train_data["validation"],
        tokenizer=tokenizer,
    )
    
    # ============ MEMORY OPTIMIZATION 11: MANAGE CUDA CACHE ============
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        logger.info("CUDA cache cleared before training")
    
    # Start training
    logger.info("Starting training...")
    trainer.train()
    
    # Save the model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    logger.info(f"Model saved to {output_dir}")
    
    return model, tokenizer

# Gradio interface functions
def process_csv(file, teacher_col, student_col, num_samples):
    df = load_data(file.name)
    if df is None:
        return "Error loading CSV file"
    return f"CSV loaded successfully with {len(df)} rows"

def start_fine_tuning(file, teacher_col, student_col, model_id, epochs, batch_size, num_samples):
    try:
        # Load and process data
        df = load_data(file.name)
        if df is None:
            return "Error loading CSV file"
        
        # Prepare dataset
        dataset = prepare_dataset(df, teacher_col, student_col, num_samples=int(num_samples))
        
        # Load tokenizer for preprocessing
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        
        # Tokenize dataset
        tokenized_dataset = {
            "train": tokenize_data(dataset["train"], tokenizer),
            "validation": tokenize_data(dataset["test"], tokenizer),
        }
        
        # Create output directory
        output_dir = "./fine_tuned_model"
        os.makedirs(output_dir, exist_ok=True)
        
        # Finetune model with memory optimizations
        finetune_model(
            model_id=model_id,
            train_data=tokenized_dataset,
            output_dir=output_dir,
            epochs=int(epochs),
            batch_size=int(batch_size),
        )
        
        return "Fine-tuning completed successfully!"
    
    except Exception as e:
        logger.error(f"Error during fine-tuning: {e}")
        return f"Error during fine-tuning: {str(e)}"

# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Teacher-Student Bot Fine-Tuning")
    
    with gr.Tab("Upload Data"):
        file_input = gr.File(label="Upload CSV File")
        with gr.Row():
            teacher_col = gr.Textbox(label="Teacher Column", value="Unnamed: 0")
            student_col = gr.Textbox(label="Student Column", value="idx")
        num_samples = gr.Slider(label="Number of Samples", minimum=10, maximum=1000, value=100, step=10)
        upload_btn = gr.Button("Process CSV")
        csv_output = gr.Textbox(label="CSV Processing Result")
        upload_btn.click(process_csv, inputs=[file_input, teacher_col, student_col, num_samples], outputs=csv_output)
    
    with gr.Tab("Fine-Tune"):
        model_id = gr.Textbox(label="Model ID", value="mistralai/Mistral-7B-v0.1")
        with gr.Row():
            batch_size = gr.Number(label="Batch Size", value=8, info="Recommended: 8 or lower for 7B models")
            epochs = gr.Number(label="Number of Epochs", value=2)
        
        training_btn = gr.Button("Start Fine-Tuning")
        training_output = gr.Textbox(label="Training Progress")
        
        training_btn.click(
            start_fine_tuning,
            inputs=[file_input, teacher_col, student_col, model_id, epochs, batch_size, num_samples],
            outputs=training_output
        )

# Launch the app - REMOVED the spaces.zero.mount() call that was causing the error
demo.queue().launch(debug=True)