Spaces:

amaltese
/

avatartestspace

Runtime error

File size: 7,800 Bytes

import gradio as gr
import os
import torch
import json
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    PeftModel
)
import spaces

# Make sure the scratch cache directory exists, then point the Hugging Face
# cache environment variable at it.
# NOTE(review): this runs after `transformers` has already been imported above;
# confirm the variable is still honoured when set at this point.
os.makedirs('/tmp/hf_cache', exist_ok=True)
os.environ['TRANSFORMERS_CACHE'] = '/tmp/hf_cache'

def sample_from_csv(csv_file, sample_size=100):
    """Sample rows from a teacher/student CSV and format them for training.

    The teacher column is a column whose name contains "teacher",
    "instructor" or "prompt"; the student column is one whose name contains
    "student", "response" or "answer" (case-insensitive; when several columns
    match, the last one wins). If either column cannot be identified, the
    first two columns of the CSV are used instead.

    Args:
        csv_file: Path or file-like object accepted by ``pandas.read_csv``.
        sample_size: Maximum number of rows to draw. A value at or above the
            row count selects every row; otherwise rows are drawn with a
            fixed seed (42). Non-integer numbers are truncated to an int.

    Returns:
        A ``datasets.Dataset`` with a single ``"text"`` column holding one
        formatted string per kept row. Rows whose teacher or student cell is
        empty or NaN are skipped, so the dataset may hold fewer than
        ``sample_size`` entries.

    Raises:
        ValueError: If the CSV has fewer than two columns.
    """
    df = pd.read_csv(csv_file)

    # Display CSV info
    print(f"CSV columns: {df.columns.tolist()}")
    print(f"Total rows in CSV: {len(df)}")

    # Fail early with a readable message instead of an IndexError from the
    # first-two-columns fallback below.
    if len(df.columns) < 2:
        raise ValueError(
            "CSV must contain at least two columns (teacher and student), "
            f"found {len(df.columns)}"
        )

    # Try to identify teacher and student columns by name.
    teacher_col = None
    student_col = None
    for col in df.columns:
        col_lower = str(col).lower()
        if any(key in col_lower for key in ("teacher", "instructor", "prompt")):
            teacher_col = col
        elif any(key in col_lower for key in ("student", "response", "answer")):
            student_col = col

    # If we couldn't identify both columns, use the first two.
    if teacher_col is None or student_col is None:
        teacher_col, student_col = df.columns[0], df.columns[1]

    # `DataFrame.sample` only accepts an integer `n`; callers may hand over a
    # float (e.g. a numeric UI widget), so normalise before sampling.
    sample_size = int(sample_size)
    if sample_size >= len(df):
        sampled_df = df
    else:
        sampled_df = df.sample(n=sample_size, random_state=42)

    # Walk the two columns directly: this avoids building a Series per row
    # and keeps each cell in its own column's dtype.
    texts = []
    for teacher_raw, student_raw in zip(sampled_df[teacher_col], sampled_df[student_col]):
        teacher_text = str(teacher_raw).strip()
        student_text = str(student_raw).strip()

        # Skip rows with empty values (NaN cells stringify to 'nan').
        if not teacher_text or not student_text or teacher_text == 'nan' or student_text == 'nan':
            continue

        # Format according to the document format:
        # <s> [INST] Teacher ** <Dialogue> [/INST] Student** <Dialogue> </s>
        texts.append(
            f"<s> [INST] Teacher ** {teacher_text} [/INST] Student** {student_text} </s>"
        )

    return Dataset.from_dict({"text": texts})

@spaces.GPU
def finetune_model(csv_file, sample_size=100, num_epochs=3, progress=gr.Progress()):
    """Fine-tune Mistral 7B with LoRA on a teacher/student CSV and sample replies.

    Steps: sample and format rows from the CSV, tokenize them, attach LoRA
    adapters to ``mistralai/Mistral-7B-v0.1``, train with ``Trainer``, save
    the adapter and tokenizer to ``mistral7b_finetuned``, then generate a
    reply for a fixed list of teacher prompts and write those replies to
    ``test_results.json``.

    Args:
        csv_file: Uploaded CSV (whatever ``sample_from_csv`` accepts).
        sample_size: Maximum number of CSV rows to train on.
        num_epochs: Number of training epochs.
        progress: Gradio progress tracker used to report coarse milestones.

    Returns:
        A list of ``{"prompt": ..., "response": ...}`` dicts, one per test
        prompt.

    Raises:
        gr.Error: If no file was provided, or fewer than two usable rows
            remain after sampling (a train/test split needs at least two).
    """
    if csv_file is None:
        raise gr.Error("Please upload a CSV file before starting fine-tuning.")

    # Check GPU
    if torch.cuda.is_available():
        print(f"GPU available: {torch.cuda.get_device_name(0)}")
    else:
        print("No GPU available, fine-tuning will be extremely slow!")

    # Pass the cache location explicitly: an environment variable that is set
    # after `transformers` was imported is not guaranteed to be picked up.
    # Falls back to the library default (None) when the variable is unset.
    cache_dir = os.environ.get("TRANSFORMERS_CACHE")

    # Sample data
    progress(0.1, "Sampling data from CSV...")
    dataset = sample_from_csv(csv_file, int(sample_size))
    if len(dataset) < 2:
        raise gr.Error(
            "The CSV must contain at least two usable teacher/student rows."
        )

    # Split dataset (seeded so repeated runs train on the same split)
    dataset_split = dataset.train_test_split(test_size=0.1, seed=42)

    # Load tokenizer
    progress(0.2, "Loading tokenizer...")
    model_name = "mistralai/Mistral-7B-v0.1"
    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
    # NOTE(review): padding with EOS means the LM collator may also mask real
    # EOS tokens out of the labels — confirm this is acceptable.
    tokenizer.pad_token = tokenizer.eos_token

    # Tokenize dataset
    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    progress(0.3, "Tokenizing dataset...")
    tokenized_datasets = dataset_split.map(tokenize_function, batched=True)

    # Load model with LoRA configuration
    progress(0.4, "Loading model...")
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        cache_dir=cache_dir,
    )

    # Prepare model for LoRA training
    # NOTE(review): this helper targets quantized (k-bit) models, while the
    # model here is loaded in plain float16 — confirm its memory impact.
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)

    # Training arguments
    output_dir = "mistral7b_finetuned"
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        save_steps=50,
        logging_steps=10,
        learning_rate=2e-4,
        weight_decay=0.001,
        fp16=True,
        warmup_steps=50,
        lr_scheduler_type="cosine",
        report_to="none",  # Disable wandb
    )

    # Initialize trainer
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        data_collator=data_collator,
    )

    # Train model
    progress(0.5, "Training model...")
    trainer.train()

    # Save model
    progress(0.9, "Saving model...")
    trainer.model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Test with sample prompts
    progress(0.95, "Testing model...")
    test_prompts = [
        "How was the Math exam?",
        "Good morning students! How are you all?",
        "What should you do if you get into a fight with a friend?",
        "Did you complete your science project?",
        "What did you learn in class today?",
    ]

    # The trained model already carries the LoRA adapter, so reuse it rather
    # than wrapping the wrapped model a second time with the saved adapter.
    # eval() switches off LoRA dropout for generation.
    fine_tuned_model = trainer.model
    fine_tuned_model.eval()
    # With device_map="auto" the model decides its own placement; send inputs
    # to wherever its first parameters live.
    input_device = next(fine_tuned_model.parameters()).device

    # Generate responses
    results = []
    for prompt in test_prompts:
        formatted_prompt = f"<s> [INST] Teacher ** {prompt} [/INST] Student**"
        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(input_device)

        with torch.no_grad():
            outputs = fine_tuned_model.generate(
                **inputs,
                max_length=200,
                temperature=0.7,
                top_p=0.95,
                do_sample=True,
            )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Keep only the text after the student marker; fall back to the whole
        # decoded string when the marker is missing.
        student_part = response.split("Student**")[1].strip() if "Student**" in response else response

        results.append({
            "prompt": prompt,
            "response": student_part,
        })

    # Save results
    with open("test_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    progress(1.0, "Completed!")
    return results

# Gradio UI: a "Fine-tune Model" tab that drives finetune_model, plus a static
# "About" tab describing the app.
with gr.Blocks() as demo:
    gr.Markdown("# Mistral 7B Fine-Tuning for Student Bot")

    with gr.Tab("Fine-tune Model"):
        # Training data upload.
        with gr.Row():
            csv_input = gr.File(label="Upload Teacher-Student CSV")

        # Training knobs forwarded to the handler.
        with gr.Row():
            sample_size = gr.Slider(
                label="Sample Size",
                minimum=10,
                maximum=1000,
                step=10,
                value=100,
            )
            epochs = gr.Slider(
                label="Number of Epochs",
                minimum=1,
                maximum=10,
                step=1,
                value=3,
            )

        with gr.Row():
            start_btn = gr.Button("Start Fine-Tuning")

        # Generated prompt/response pairs are shown as JSON.
        with gr.Row():
            output = gr.JSON(label="Results")

        start_btn.click(
            fn=finetune_model,
            inputs=[csv_input, sample_size, epochs],
            outputs=[output],
        )

    with gr.Tab("About"):
        gr.Markdown("""
        ## Fine-Tuning Mistral 7B for Student Bot
        
        This app fine-tunes the Mistral 7B model to respond like a student to teacher prompts.
        
        ### Requirements
        - CSV file with teacher-student conversation pairs
        - GPU acceleration (provided by this Space)
        
        ### Process
        1. Upload your CSV file
        2. Set sample size and number of epochs
        3. Click "Start Fine-Tuning"
        4. View test results with sample prompts
        """)

# Launch app
# Start the server only when this file is run as a script, so importing the
# module does not block on a web server. The queue is enabled explicitly
# because the click handler reports progress through gr.Progress.
if __name__ == "__main__":
    demo.queue().launch()