Spaces:

amaltese
/

avatartestspace

Runtime error

App Files Files Community

amaltese committed on Feb 26

Commit

f3b5025

verified ·

1 Parent(s): e198d69

Update app.py

Browse files

Files changed (1) hide show

app.py +303 -11

app.py CHANGED Viewed

@@ -1,8 +1,30 @@
-# At the top of your file, add:
 import os
 from huggingface_hub import login
-# Get token from environment variable
 hf_token = os.environ.get("HF_TOKEN")
 if hf_token:
     login(token=hf_token)
@@ -10,13 +32,283 @@ if hf_token:
 else:
     print("No Hugging Face token found. You may encounter access issues with gated models.")
-# Then modify your model loading code to include the token:
-tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
-# And later:
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    torch_dtype=torch.float16,
-    device_map="auto",
-    token=hf_token
-)

+import gradio as gr
 import os
+import torch
+import json
+import pandas as pd
+from datasets import Dataset
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    TrainingArguments,
+    Trainer,
+    DataCollatorForLanguageModeling
+)
+from peft import (
+    LoraConfig,
+    get_peft_model,
+    prepare_model_for_kbit_training,
+    PeftModel
+)
+import spaces
 from huggingface_hub import login
+# Set environment variable for cache directory
+# NOTE(review): this assignment runs after `transformers` has already been
+# imported above; confirm the library still picks the value up at this point,
+# otherwise move it ahead of the imports.
+os.environ['TRANSFORMERS_CACHE'] = '/tmp/hf_cache'
+os.makedirs('/tmp/hf_cache', exist_ok=True)
+# Get token from environment variable and log in
+# `hf_token` is None when HF_TOKEN is unset; it is later passed as `token=`
+# to the tokenizer/model loaders and to the model-access check.
 hf_token = os.environ.get("HF_TOKEN")
 if hf_token:
     login(token=hf_token)
 else:
     print("No Hugging Face token found. You may encounter access issues with gated models.")
+def sample_from_csv(csv_file, sample_size=100):
+    """Sample teacher/student pairs from a CSV and format them for training.
+
+    Args:
+        csv_file: Path or file-like object accepted by ``pandas.read_csv``.
+        sample_size: Maximum number of rows to draw. The whole file is used
+            when it has no more rows than this.
+
+    Returns:
+        A ``datasets.Dataset`` with a single ``"text"`` column holding one
+        formatted prompt/response string per usable row. It may be empty
+        when every sampled row has a missing or blank value.
+
+    Raises:
+        ValueError: If ``sample_size`` is smaller than 1, or if the teacher
+            and student columns cannot be identified by name and the CSV
+            has fewer than two columns to fall back on.
+    """
+    # UI sliders may deliver floats; ``DataFrame.sample`` needs an integer.
+    sample_size = int(sample_size)
+    if sample_size < 1:
+        raise ValueError("sample_size must be at least 1")
+    df = pd.read_csv(csv_file)
+    # Display CSV info
+    print(f"CSV columns: {df.columns.tolist()}")
+    print(f"Total rows in CSV: {len(df)}")
+    # Try to identify teacher and student columns
+    teacher_col = None
+    student_col = None
+    for col in df.columns:
+        # Headers are not guaranteed to be strings (e.g. numeric headers).
+        col_lower = str(col).lower()
+        if 'teacher' in col_lower or 'instructor' in col_lower or 'prompt' in col_lower:
+            teacher_col = col
+        elif 'student' in col_lower or 'response' in col_lower or 'answer' in col_lower:
+            student_col = col
+    # If we couldn't identify columns, use the first two
+    if teacher_col is None or student_col is None:
+        if len(df.columns) < 2:
+            raise ValueError(
+                "CSV must contain at least two columns (teacher text and student text)"
+            )
+        teacher_col = df.columns[0]
+        student_col = df.columns[1]
+        print(f"Using columns: {teacher_col} (teacher) and {student_col} (student)")
+    else:
+        print(f"Identified columns: {teacher_col} (teacher) and {student_col} (student)")
+    # Sample rows (fixed seed so repeated runs draw the same rows)
+    if sample_size >= len(df):
+        sampled_df = df
+    else:
+        sampled_df = df.sample(n=sample_size, random_state=42)
+    # Format data. Iterating the two columns directly (instead of iterrows)
+    # keeps each column's own dtype, so integer cells are not rendered as floats.
+    texts = []
+    for teacher_raw, student_raw in zip(sampled_df[teacher_col], sampled_df[student_col]):
+        teacher_text = str(teacher_raw).strip()
+        student_text = str(student_raw).strip()
+        # Skip rows with empty values (missing cells stringify to 'nan')
+        if not teacher_text or not student_text or teacher_text == 'nan' or student_text == 'nan':
+            continue
+        # Format according to the document format:
+        # <s> [INST] Teacher ** <Dialogue> [/INST] Student** <Dialogue> </s>
+        formatted_text = f"<s> [INST] Teacher ** {teacher_text} [/INST] Student** {student_text} </s>"
+        texts.append(formatted_text)
+    print(f"Created {len(texts)} formatted examples")
+    return Dataset.from_dict({"text": texts})
+@spaces.GPU
+def finetune_model(csv_file, sample_size=100, num_epochs=3, progress=gr.Progress()):
+    """Fine-tune Mistral 7B with LoRA on a teacher/student CSV and test it.
+
+    Args:
+        csv_file: Uploaded CSV, forwarded unchanged to ``sample_from_csv``.
+        sample_size: Maximum number of CSV rows to train on.
+        num_epochs: Number of training epochs.
+        progress: Gradio progress tracker used to report the current stage.
+
+    Returns:
+        A list of ``{"prompt": ..., "response": ...}`` dicts, one per built-in
+        test prompt. The same list is also written to ``test_results.json``.
+
+    Raises:
+        gr.Error: If no CSV was supplied, or if it yields fewer than two
+            usable examples (the train/test split needs at least one each).
+    """
+    if csv_file is None:
+        raise gr.Error("Please upload a CSV file before starting fine-tuning.")
+    # Check GPU
+    if torch.cuda.is_available():
+        print(f"GPU available: {torch.cuda.get_device_name(0)}")
+        device = torch.device("cuda")
+    else:
+        print("No GPU available, fine-tuning will be extremely slow!")
+        device = torch.device("cpu")
+    # Sample data
+    progress(0.1, "Sampling data from CSV...")
+    dataset = sample_from_csv(csv_file, sample_size)
+    # Fail early with a readable message instead of an opaque split error.
+    if len(dataset) < 2:
+        raise gr.Error(
+            "The CSV produced fewer than two usable teacher/student examples; "
+            "at least two are needed to build the train/test split."
+        )
+    # Split dataset
+    dataset_split = dataset.train_test_split(test_size=0.1)
+    # Load tokenizer
+    progress(0.2, "Loading tokenizer...")
+    # Try the non-gated Mistral model first
+    model_name = "mistralai/Mistral-7B-Instruct-v0.2"
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
+        print(f"Successfully loaded tokenizer for {model_name}")
+    except Exception as e:
+        print(f"Error loading {model_name}: {e}")
+        print("Falling back to original Mistral model with token authentication...")
+        model_name = "mistralai/Mistral-7B-v0.1"
+        tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
+    # NOTE(review): padding with EOS may cause the collator to mask real EOS
+    # tokens out of the labels as well; confirm this is acceptable.
+    tokenizer.pad_token = tokenizer.eos_token
+    # Tokenize dataset
+    def tokenize_function(examples):
+        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
+    progress(0.3, "Tokenizing dataset...")
+    tokenized_datasets = dataset_split.map(tokenize_function, batched=True)
+    # Load model with LoRA configuration
+    progress(0.4, "Loading model...")
+    lora_config = LoraConfig(
+        r=8,
+        lora_alpha=16,
+        target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+        lora_dropout=0.05,
+        bias="none",
+        task_type="CAUSAL_LM"
+    )
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        torch_dtype=torch.float16,
+        device_map="auto",
+        token=hf_token,
+    )
+    # Prepare model for LoRA training
+    # NOTE(review): the base model is loaded unquantized here; confirm that the
+    # k-bit preparation helper is still the intended setup step.
+    model = prepare_model_for_kbit_training(model)
+    model = get_peft_model(model, lora_config)
+    # Print model info
+    print(f"Model loaded: {model_name}")
+    model_params = sum(p.numel() for p in model.parameters())
+    print(f"Model parameters: {model_params:,}")
+    # Training arguments
+    output_dir = "mistral7b_finetuned"
+    training_args = TrainingArguments(
+        output_dir=output_dir,
+        num_train_epochs=num_epochs,
+        per_device_train_batch_size=1,
+        gradient_accumulation_steps=4,
+        save_steps=50,
+        logging_steps=10,
+        learning_rate=2e-4,
+        weight_decay=0.001,
+        fp16=True,
+        warmup_steps=50,
+        lr_scheduler_type="cosine",
+        report_to="none",  # Disable wandb
+    )
+    # Initialize trainer
+    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=tokenized_datasets["train"],
+        eval_dataset=tokenized_datasets["test"],
+        data_collator=data_collator,
+    )
+    # Train model
+    progress(0.5, "Training model...")
+    trainer.train()
+    # Save model (LoRA adapter) and tokenizer
+    progress(0.9, "Saving model...")
+    trainer.model.save_pretrained(output_dir)
+    tokenizer.save_pretrained(output_dir)
+    # Test with sample prompts
+    progress(0.95, "Testing model...")
+    test_prompts = [
+        "How was the Math exam?",
+        "Good morning students! How are you all?",
+        "What should you do if you get into a fight with a friend?",
+        "Did you complete your science project?",
+        "What did you learn in class today?"
+    ]
+    # Reuse the in-memory trained model for inference. It is already a PEFT
+    # model carrying the trained adapter, so loading the saved adapter on top
+    # of it again would wrap a PEFT model inside another PEFT model.
+    fine_tuned_model = trainer.model
+    fine_tuned_model.eval()
+    # Generate responses
+    results = []
+    for prompt in test_prompts:
+        formatted_prompt = f"<s> [INST] Teacher ** {prompt} [/INST] Student**"
+        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
+        with torch.no_grad():
+            outputs = fine_tuned_model.generate(
+                **inputs,
+                max_length=200,
+                temperature=0.7,
+                top_p=0.95,
+                do_sample=True,
+            )
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # Keep only the text after the student marker when it is present.
+        student_part = response.split("Student**")[1].strip() if "Student**" in response else response
+        results.append({
+            "prompt": prompt,
+            "response": student_part
+        })
+    # Save results
+    with open("test_results.json", "w", encoding="utf-8") as f:
+        json.dump(results, f, indent=2)
+    progress(1.0, "Completed!")
+    return results
+# Define Gradio interface
+# Three tabs: a system/authentication check, the fine-tuning form, and a
+# static "About" page. Components appear in the order they are created here.
+with gr.Blocks() as demo:
+    gr.Markdown("# Mistral 7B Fine-Tuning for Student Bot")
+    with gr.Tab("System Check"):
+        check_btn = gr.Button("Check GPU and Authentication Status")
+        system_output = gr.Textbox(label="System Status", lines=5)
+        @spaces.GPU
+        def check_system():
+            """Return a newline-joined report of GPU, token, and model-access status."""
+            status = []
+            # Check GPU
+            if torch.cuda.is_available():
+                status.append(f"✅ GPU AVAILABLE: {torch.cuda.get_device_name(0)}")
+                gpu_memory = f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB"
+                status.append(gpu_memory)
+            else:
+                status.append("❌ NO GPU DETECTED.")
+            # Check HF token
+            if os.environ.get("HF_TOKEN"):
+                status.append("✅ Hugging Face token found")
+            else:
+                status.append("❌ No Hugging Face token found. You may encounter access issues with gated models.")
+            # Check if we can access Mistral model
+            # Any failure is reported in the status text instead of being raised.
+            try:
+                from huggingface_hub import model_info
+                # `hf_token` is the module-level value read from HF_TOKEN.
+                info = model_info("mistralai/Mistral-7B-Instruct-v0.2", token=hf_token)
+                status.append(f"✅ Access to Mistral model verified: {info.modelId}")
+            except Exception as e:
+                status.append(f"❌ Cannot access Mistral model: {str(e)}")
+            return "\n".join(status)
+        check_btn.click(check_system, inputs=[], outputs=[system_output])
+    with gr.Tab("Fine-tune Model"):
+        with gr.Row():
+            csv_input = gr.File(label="Upload Teacher-Student CSV")
+        with gr.Row():
+            sample_size = gr.Slider(minimum=10, maximum=1000, value=100, step=10, label="Sample Size")
+            epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Epochs")
+        with gr.Row():
+            start_btn = gr.Button("Start Fine-Tuning")
+        with gr.Row():
+            output = gr.JSON(label="Results")
+        # Inputs map positionally onto finetune_model(csv_file, sample_size, num_epochs).
+        start_btn.click(finetune_model, inputs=[csv_input, sample_size, epochs], outputs=[output])
+    with gr.Tab("About"):
+        gr.Markdown("""
+        ## Fine-Tuning Mistral 7B for Student Bot
+        This app fine-tunes the Mistral 7B model to respond like a student to teacher prompts.
+        ### Requirements
+        - CSV file with teacher-student conversation pairs
+        - GPU acceleration (provided by this Space)
+        - Hugging Face authentication for accessing gated models
+        ### Process
+        1. Upload your CSV file
+        2. Set sample size and number of epochs
+        3. Click "Start Fine-Tuning"
+        4. View test results with sample prompts
+        ### Important Notes
+        - The app tries to use Mistral-7B-Instruct-v0.2 which is not gated
+        - If that fails, it falls back to the original Mistral-7B-v0.1 model (which requires authentication)
+        - Fine-tuning can take several hours depending on your sample size and epochs
+        """)
+# Launch app
+# Runs at import time: the module has no `if __name__ == "__main__"` guard.
+demo.launch()