amaltese committed
Commit a4467aa · verified · 1 Parent(s): de78a7b

Update app.py

Files changed (1)
  1. app.py +205 -260
app.py CHANGED
@@ -1,307 +1,252 @@
  import gradio as gr
- import os
- import torch
- import json
  import pandas as pd
+ import torch
+ import os
  from datasets import Dataset
- from transformers import (
-     AutoModelForCausalLM,
-     AutoTokenizer,
-     TrainingArguments,
-     Trainer,
-     DataCollatorForLanguageModeling
- )
- from peft import (
-     LoraConfig,
-     get_peft_model,
-     prepare_model_for_kbit_training,
-     PeftModel
- )
- import spaces
- from huggingface_hub import login
-
- # Set environment variable for cache directory
- os.environ['TRANSFORMERS_CACHE'] = '/tmp/hf_cache'
- os.makedirs('/tmp/hf_cache', exist_ok=True)
-
- # Get token from environment variable and log in
- hf_token = os.environ.get("HF_TOKEN")
- if hf_token:
-     login(token=hf_token)
-     print("Successfully logged in to Hugging Face Hub")
- else:
-     print("No Hugging Face token found. You may encounter access issues with gated models.")
-
- def sample_from_csv(csv_file, sample_size=100):
-     """Sample from CSV file and format for training"""
-     df = pd.read_csv(csv_file)
-
-     # Display CSV info
-     print(f"CSV columns: {df.columns.tolist()}")
-     print(f"Total rows in CSV: {len(df)}")
-
-     # Try to identify teacher and student columns
-     teacher_col = None
-     student_col = None
-
-     for col in df.columns:
-         col_lower = col.lower()
-         if 'teacher' in col_lower or 'instructor' in col_lower or 'prompt' in col_lower:
-             teacher_col = col
-         elif 'student' in col_lower or 'response' in col_lower or 'answer' in col_lower:
-             student_col = col
-
-     # If we couldn't identify columns, use the first two
-     if teacher_col is None or student_col is None:
-         teacher_col = df.columns[0]
-         student_col = df.columns[1]
-         print(f"Using columns: {teacher_col} (teacher) and {student_col} (student)")
-     else:
-         print(f"Identified columns: {teacher_col} (teacher) and {student_col} (student)")
-
-     # Sample rows
-     if sample_size >= len(df):
-         sampled_df = df
-     else:
-         sampled_df = df.sample(n=sample_size, random_state=42)
-
-     # Format data
-     texts = []
-     for _, row in sampled_df.iterrows():
-         teacher_text = str(row[teacher_col]).strip()
-         student_text = str(row[student_col]).strip()
-
-         # Skip rows with empty values
-         if not teacher_text or not student_text or teacher_text == 'nan' or student_text == 'nan':
-             continue
-
-         # Format according to the document format:
-         # <s> [INST] Teacher ** <Dialogue> [/INST] Student** <Dialogue> </s>
-         formatted_text = f"<s> [INST] Teacher ** {teacher_text} [/INST] Student** {student_text} </s>"
-         texts.append(formatted_text)
-
-     print(f"Created {len(texts)} formatted examples")
-     return Dataset.from_dict({"text": texts})
-
- @spaces.GPU
- def finetune_model(csv_file, sample_size=100, num_epochs=3, progress=gr.Progress()):
-     """Fine-tune the model and return results"""
-     # Check GPU
-     if torch.cuda.is_available():
-         print(f"GPU available: {torch.cuda.get_device_name(0)}")
-         device = torch.device("cuda")
-     else:
-         print("No GPU available, fine-tuning will be extremely slow!")
-         device = torch.device("cpu")
-
-     # Sample data
-     progress(0.1, "Sampling data from CSV...")
-     dataset = sample_from_csv(csv_file, sample_size)
-
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
+ from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
+ import spaces  # Import the spaces library for HF Spaces integration
+
+ # Initialize logging
+ import logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Function to load and process data
+ def load_data(csv_file):
+     try:
+         df = pd.read_csv(csv_file)
+         logger.info(f"CSV columns: {df.columns.tolist()}")
+         logger.info(f"Total rows in CSV: {len(df)}")
+         return df
+     except Exception as e:
+         logger.error(f"Error loading CSV: {e}")
+         return None
+
+ # Function to prepare dataset
+ def prepare_dataset(df, teacher_col, student_col, num_samples=100):
+     # Extract and format data
+     logger.info(f"Using columns: {teacher_col} (teacher) and {student_col} (student)")
+
+     formatted_data = []
+     for i in range(min(num_samples, len(df))):
+         teacher_text = str(df.iloc[i][teacher_col])
+         student_text = str(df.iloc[i][student_col])
+
+         # Create prompt
+         formatted_text = f"### Teacher: {teacher_text}\n### Student: {student_text}"
+         formatted_data.append({"text": formatted_text})
+
+     logger.info(f"Created {len(formatted_data)} formatted examples")
+
+     # Create dataset
+     dataset = Dataset.from_list(formatted_data)
+
      # Split dataset
-     dataset_split = dataset.train_test_split(test_size=0.1)
-
-     # Load tokenizer
-     progress(0.2, "Loading tokenizer...")
-
-     # Use only the original Mistral model
-     model_name = "mistralai/Mistral-7B-v0.1"
-     print(f"Using model: {model_name}")
-
-     tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
-     tokenizer.pad_token = tokenizer.eos_token
-
-     # Tokenize dataset
+     train_val_split = dataset.train_test_split(test_size=0.1, seed=42)
+
+     return train_val_split
+
+ # Function to tokenize data
+ def tokenize_data(dataset, tokenizer, max_length=512):
      def tokenize_function(examples):
-         return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
-
-     progress(0.3, "Tokenizing dataset...")
-     tokenized_datasets = dataset_split.map(tokenize_function, batched=True)
-
-     # Load model with LoRA configuration
-     progress(0.4, "Loading model...")
-     lora_config = LoraConfig(
-         r=8,
-         lora_alpha=16,
-         target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
-         lora_dropout=0.05,
-         bias="none",
-         task_type="CAUSAL_LM"
-     )
-
+         return tokenizer(
+             examples["text"],
+             truncation=True,
+             max_length=max_length,
+             padding="max_length"
+         )
+
+     tokenized_dataset = dataset.map(tokenize_function, batched=True)
+     return tokenized_dataset
+
+ # Main fine-tuning function with memory optimizations
+ def finetune_model(model_id, train_data, output_dir, epochs, batch_size=None):
+     """
+     Fine-tune a model with optimized memory settings to prevent CUDA OOM errors.
+     """
+     logger.info(f"Using model: {model_id}")
+
+     # Load tokenizer
+     tokenizer = AutoTokenizer.from_pretrained(model_id)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     # ============ MEMORY OPTIMIZATION 1: REDUCED BATCH SIZE ============
+     # A smaller batch size dramatically reduces memory usage during training
+     # For 7B models on limited VRAM (40GB), values between 1-8 are recommended
+     actual_batch_size = 8 if batch_size is None else min(batch_size, 8)
+     logger.info(f"Using batch size: {actual_batch_size} (reduced from original to save memory)")
+
+     # ============ MEMORY OPTIMIZATION 2: 8-bit QUANTIZATION ============
+     # Load model in 8-bit to reduce memory footprint during training
      model = AutoModelForCausalLM.from_pretrained(
-         model_name,
-         torch_dtype=torch.float16,
-         device_map="auto",
-         token=hf_token,
+         model_id,
+         load_in_8bit=True,  # Use 8-bit quantization to reduce memory usage
+         device_map="auto",  # Automatically handle model distribution
+         use_cache=False,  # Disable KV cache which uses extra memory
+         torch_dtype=torch.float16,  # Use lower precision
      )
-
-     # Prepare model for LoRA training
+
+     # Count model parameters
+     logger.info(f"Model parameters: {model.num_parameters():,}")
+
+     # Prepare model for training with quantization
      model = prepare_model_for_kbit_training(model)
-     model = get_peft_model(model, lora_config)
-
-     # Print model info
-     print(f"Model loaded: {model_name}")
-     model_params = sum(p.numel() for p in model.parameters())
-     print(f"Model parameters: {model_params:,}")
-
-     # Training arguments
-     output_dir = "mistral7b_finetuned"
+
+     # ============ MEMORY OPTIMIZATION 3: GRADIENT CHECKPOINTING ============
+     # Enable gradient checkpointing to trade compute for memory
+     # This recomputes forward activations during backward pass instead of storing them
+     model.gradient_checkpointing_enable()
+     logger.info("Gradient checkpointing enabled: trading computation for memory savings")
+
+     # ============ MEMORY OPTIMIZATION 4: OPTIMIZED LORA CONFIG ============
+     # Use lower rank and fewer modules to reduce memory requirements
+     peft_config = LoraConfig(
+         task_type=TaskType.CAUSAL_LM,
+         inference_mode=False,
+         r=4,  # REDUCED from default 8/16 to save memory
+         lora_alpha=16,  # Scaling factor
+         lora_dropout=0.1,  # Dropout probability for regularization
+         # Target specific modules instead of all linear layers to save memory
+         target_modules=["q_proj", "v_proj"],  # Only attention query and value projections
+     )
+     logger.info("Using optimized LoRA parameters with reduced rank (r=4) and targeted modules")
+
+     # Apply LoRA adapters to the model
+     model = get_peft_model(model, peft_config)
+     model.print_trainable_parameters()  # Print trainable parameters info
+
+     # Define training arguments
      training_args = TrainingArguments(
          output_dir=output_dir,
-         num_train_epochs=num_epochs,
-         per_device_train_batch_size=1,
-         gradient_accumulation_steps=4,
-         save_steps=50,
+         num_train_epochs=epochs,
+         # ============ MEMORY OPTIMIZATION 5: REDUCED BATCH SIZE IN ARGS ============
+         per_device_train_batch_size=actual_batch_size,
+         per_device_eval_batch_size=actual_batch_size,
+         # ============ MEMORY OPTIMIZATION 6: MIXED PRECISION TRAINING ============
+         # Mixed precision significantly reduces memory usage
+         fp16=True,  # Use FP16 for mixed precision training
+         # ============ MEMORY OPTIMIZATION 7: GRADIENT ACCUMULATION ============
+         # Simulate larger batch sizes without the memory cost
+         gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps (effective batch size = 8*4=32)
+         # ============ MEMORY OPTIMIZATION 8: GRADIENT CHECKPOINTING IN ARGS ============
+         gradient_checkpointing=True,
+         # Other parameters
          logging_steps=10,
+         save_strategy="epoch",
+         evaluation_strategy="epoch",
          learning_rate=2e-4,
-         weight_decay=0.001,
-         fp16=True,
-         warmup_steps=50,
-         lr_scheduler_type="cosine",
-         report_to="none",  # Disable wandb
+         weight_decay=0.01,
+         warmup_ratio=0.03,
+         # ============ MEMORY OPTIMIZATION 9: REDUCED OPTIMIZER OVERHEAD ============
+         optim="adamw_torch_fused",  # More memory-efficient optimizer
+         # ============ MEMORY OPTIMIZATION 10: REDUCED LOGGING MEMORY ============
+         report_to="none",  # Disable extra logging to save memory
      )
-
-     # Initialize trainer
-     data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+     # Initialize the Trainer
      trainer = Trainer(
          model=model,
          args=training_args,
-         train_dataset=tokenized_datasets["train"],
-         eval_dataset=tokenized_datasets["test"],
-         data_collator=data_collator,
+         train_dataset=train_data["train"],
+         eval_dataset=train_data["validation"],
+         tokenizer=tokenizer,
      )
-
-     # Train model
-     progress(0.5, "Training model...")
+
+     # ============ MEMORY OPTIMIZATION 11: MANAGE CUDA CACHE ============
+     # Clear CUDA cache before training to start with a clean memory state
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+         logger.info("CUDA cache cleared before training")
+
+     # Start training
+     logger.info("Starting training...")
      trainer.train()
-
-     # Save model
-     progress(0.9, "Saving model...")
-     trainer.model.save_pretrained(output_dir)
+
+     # Save the model
+     model.save_pretrained(output_dir)
      tokenizer.save_pretrained(output_dir)
-
-     # Test with sample prompts
-     progress(0.95, "Testing model...")
-     test_prompts = [
-         "How was the Math exam?",
-         "Good morning students! How are you all?",
-         "What should you do if you get into a fight with a friend?",
-         "Did you complete your science project?",
-         "What did you learn in class today?"
-     ]
-
-     # Load the fine-tuned model for inference
-     fine_tuned_model = PeftModel.from_pretrained(
-         model,
-         output_dir,
-         device_map="auto",
-     )
-
-     # Generate responses
-     results = []
-     for prompt in test_prompts:
-         formatted_prompt = f"<s> [INST] Teacher ** {prompt} [/INST] Student**"
-         inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
-
-         with torch.no_grad():
-             outputs = fine_tuned_model.generate(
-                 **inputs,
-                 max_length=200,
-                 temperature=0.7,
-                 top_p=0.95,
-                 do_sample=True,
-             )
-
-         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-         student_part = response.split("Student**")[1].strip() if "Student**" in response else response
-
-         results.append({
-             "prompt": prompt,
-             "response": student_part
-         })
-
-     # Save results
-     with open("test_results.json", "w") as f:
-         json.dump(results, f, indent=2)
-
-     progress(1.0, "Completed!")
-     return results
-
- # Define Gradio interface
+     logger.info(f"Model saved to {output_dir}")
+
+     return model, tokenizer
+
+ # Gradio interface functions
+ def process_csv(file, teacher_col, student_col, num_samples):
+     df = load_data(file.name)
+     if df is None:
+         return "Error loading CSV file"
+     return f"CSV loaded successfully with {len(df)} rows"
+
+ def start_fine_tuning(file, teacher_col, student_col, model_id, epochs, batch_size, num_samples):
+     try:
+         # Load and process data
+         df = load_data(file.name)
+         if df is None:
+             return "Error loading CSV file"
+
+         # Prepare dataset
+         dataset = prepare_dataset(df, teacher_col, student_col, num_samples=int(num_samples))
+
+         # Load tokenizer for preprocessing
+         tokenizer = AutoTokenizer.from_pretrained(model_id)
+         if tokenizer.pad_token is None:
+             tokenizer.pad_token = tokenizer.eos_token
+
+         # Tokenize dataset
+         tokenized_dataset = {
+             "train": tokenize_data(dataset["train"], tokenizer),
+             "validation": tokenize_data(dataset["test"], tokenizer),
+         }
+
+         # Create output directory
+         output_dir = "./fine_tuned_model"
+         os.makedirs(output_dir, exist_ok=True)
+
+         # Finetune model with memory optimizations
+         finetune_model(
+             model_id=model_id,
+             train_data=tokenized_dataset,
+             output_dir=output_dir,
+             epochs=int(epochs),
+             batch_size=int(batch_size),
+         )
+
+         return "Fine-tuning completed successfully!"
+
+     except Exception as e:
+         logger.error(f"Error during fine-tuning: {e}")
+         return f"Error during fine-tuning: {str(e)}"
+
+ # Create Gradio interface
  with gr.Blocks() as demo:
-     gr.Markdown("# Mistral 7B Fine-Tuning for Student Bot")
-
-     with gr.Tab("System Check"):
-         check_btn = gr.Button("Check GPU and Authentication Status")
-         system_output = gr.Textbox(label="System Status", lines=5)
-
-         @spaces.GPU
-         def check_system():
-             status = []
-             # Check GPU
-             if torch.cuda.is_available():
-                 status.append(f"✅ GPU AVAILABLE: {torch.cuda.get_device_name(0)}")
-                 gpu_memory = f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB"
-                 status.append(gpu_memory)
-             else:
-                 status.append("❌ NO GPU DETECTED.")
-
-             # Check HF token
-             if os.environ.get("HF_TOKEN"):
-                 status.append("✅ Hugging Face token found")
-             else:
-                 status.append("❌ No Hugging Face token found. You may encounter access issues with gated models.")
-
-             # Check if we can access Mistral model
-             try:
-                 from huggingface_hub import model_info
-                 info = model_info("mistralai/Mistral-7B-v0.1", token=hf_token)
-                 status.append(f"✅ Access to Mistral-7B-v0.1 model verified: {info.modelId}")
-             except Exception as e:
-                 status.append(f"❌ Cannot access Mistral-7B-v0.1 model: {str(e)}")
-
-             return "\n".join(status)
-
-         check_btn.click(check_system, inputs=[], outputs=[system_output])
-
-     with gr.Tab("Fine-tune Model"):
-         with gr.Row():
-             csv_input = gr.File(label="Upload Teacher-Student CSV")
-
-         with gr.Row():
-             sample_size = gr.Slider(minimum=10, maximum=1000, value=100, step=10, label="Sample Size")
-             epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Epochs")
-
+     gr.Markdown("# Teacher-Student Bot Fine-Tuning")
+
+     with gr.Tab("Upload Data"):
+         file_input = gr.File(label="Upload CSV File")
          with gr.Row():
-             start_btn = gr.Button("Start Fine-Tuning")
-
+             teacher_col = gr.Textbox(label="Teacher Column", value="Unnamed: 0")
+             student_col = gr.Textbox(label="Student Column", value="idx")
+             num_samples = gr.Slider(label="Number of Samples", minimum=10, maximum=1000, value=100, step=10)
+         upload_btn = gr.Button("Process CSV")
+         csv_output = gr.Textbox(label="CSV Processing Result")
+         upload_btn.click(process_csv, inputs=[file_input, teacher_col, student_col, num_samples], outputs=csv_output)
+
+     with gr.Tab("Fine-Tune"):
+         model_id = gr.Textbox(label="Model ID", value="mistralai/Mistral-7B-v0.1")
          with gr.Row():
-             output = gr.JSON(label="Results")
-
-         start_btn.click(finetune_model, inputs=[csv_input, sample_size, epochs], outputs=[output])
-
-     with gr.Tab("About"):
-         gr.Markdown("""
-         ## Fine-Tuning Mistral 7B for Student Bot
-
-         This app fine-tunes the original Mistral-7B-v0.1 model to respond like a student to teacher prompts.
-
-         ### Requirements
-         - CSV file with teacher-student conversation pairs
-         - GPU acceleration (provided by this Space)
-         - Hugging Face authentication for accessing Mistral-7B-v0.1 (which is a gated model)
-
-         ### Process
-         1. Upload your CSV file
-         2. Set sample size and number of epochs
-         3. Click "Start Fine-Tuning"
-         4. View test results with sample prompts
-
-         ### Important Notes
-         - Fine-tuning can take several hours depending on your sample size and epochs
-         - The model will be saved in the Space and can be downloaded for further use
-         """)
-
- # Launch app
- demo.launch()
+             batch_size = gr.Number(label="Batch Size", value=8, info="Recommended: 8 or lower for 7B models")
+             epochs = gr.Number(label="Number of Epochs", value=2)
+
+         training_btn = gr.Button("Start Fine-Tuning")
+         training_output = gr.Textbox(label="Training Progress")
+
+         training_btn.click(
+             start_fine_tuning,
+             inputs=[file_input, teacher_col, student_col, model_id, epochs, batch_size, num_samples],
+             outputs=training_output
+         )
+
+ # Launch the Space
+ spaces.zero.mount()
+ demo.queue().launch(debug=True)
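
For reference, a minimal sketch (not part of the commit) of how the new helpers could be driven outside the Gradio UI; it simply mirrors what start_fine_tuning wires together. The CSV path and its teacher/student column names are hypothetical, it assumes the functions defined in the new app.py are importable, and it assumes a GPU with enough memory for the 8-bit Mistral-7B load:

    # Hypothetical local driver -- mirrors start_fine_tuning() in the new app.py
    from transformers import AutoTokenizer
    from app import load_data, prepare_dataset, tokenize_data, finetune_model  # assumed import path

    df = load_data("dialogs.csv")  # hypothetical CSV
    splits = prepare_dataset(df, "teacher", "student", num_samples=100)  # hypothetical column names

    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    train_data = {
        "train": tokenize_data(splits["train"], tokenizer),
        "validation": tokenize_data(splits["test"], tokenizer),
    }

    model, tokenizer = finetune_model(
        model_id="mistralai/Mistral-7B-v0.1",
        train_data=train_data,
        output_dir="./fine_tuned_model",
        epochs=2,
        batch_size=8,
    )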