import gradio as gr
import os
import torch
import json
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    PeftModel
)
import spaces

# Set environment variable for cache directory
os.environ['TRANSFORMERS_CACHE'] = '/tmp/hf_cache'
os.makedirs('/tmp/hf_cache', exist_ok=True)
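
# Note: /tmp is assumed writable here (true on Hugging Face Spaces); newer
# transformers releases prefer HF_HOME over the deprecated TRANSFORMERS_CACHE.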

def sample_from_csv(csv_file, sample_size=100):
    """Sample from CSV file and format for training"""
    df = pd.read_csv(csv_file)

    # Display CSV info
    print(f"CSV columns: {df.columns.tolist()}")
    print(f"Total rows in CSV: {len(df)}")

    # Try to identify teacher and student columns
    teacher_col = None
    student_col = None
    for col in df.columns:
        col_lower = col.lower()
        if 'teacher' in col_lower or 'instructor' in col_lower or 'prompt' in col_lower:
            teacher_col = col
        elif 'student' in col_lower or 'response' in col_lower or 'answer' in col_lower:
            student_col = col

    # If we couldn't identify columns, use the first two
    if teacher_col is None or student_col is None:
        teacher_col = df.columns[0]
        student_col = df.columns[1]

    # Sample rows
    if sample_size >= len(df):
        sampled_df = df
    else:
        sampled_df = df.sample(n=sample_size, random_state=42)

    # Format data
    texts = []
    for _, row in sampled_df.iterrows():
        teacher_text = str(row[teacher_col]).strip()
        student_text = str(row[student_col]).strip()

        # Skip rows with empty values
        if not teacher_text or not student_text or teacher_text == 'nan' or student_text == 'nan':
            continue

        # Format according to the document format:
        # <s> [INST] Teacher ** <Dialogue> [/INST] Student** <Dialogue> </s>
        formatted_text = f"<s> [INST] Teacher ** {teacher_text} [/INST] Student** {student_text} </s>"
        texts.append(formatted_text)

    return Dataset.from_dict({"text": texts})
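
# A minimal example of the CSV layout this function expects (the column names
# below are only an illustration: any columns whose names contain
# teacher/instructor/prompt and student/response/answer are picked up,
# otherwise the first two columns are used):
#
#   teacher,student
#   "Good morning students! How are you all?","Good morning, ma'am! We are fine."
#   "Did you complete your science project?","Yes, I finished it yesterday."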

# @spaces.GPU requests GPU hardware on a ZeroGPU Space, which is why `spaces`
# is imported above; the duration value is an assumption -- tune it to your
# run length, or drop the decorator on dedicated GPU hardware.
@spaces.GPU(duration=300)
def finetune_model(csv_file, sample_size=100, num_epochs=3, progress=gr.Progress()):
    """Fine-tune the model and return results"""
    # Check GPU
    if torch.cuda.is_available():
        print(f"GPU available: {torch.cuda.get_device_name(0)}")
        device = torch.device("cuda")
    else:
        print("No GPU available, fine-tuning will be extremely slow!")
        device = torch.device("cpu")

    # Sample data
    progress(0.1, "Sampling data from CSV...")
    dataset = sample_from_csv(csv_file, sample_size)

    # Split dataset
    dataset_split = dataset.train_test_split(test_size=0.1)

    # Load tokenizer
    progress(0.2, "Loading tokenizer...")
    model_name = "mistralai/Mistral-7B-v0.1"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
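    # Mistral's tokenizer ships without a pad token, so EOS is reused above;
    # with an attention mask this is the usual choice for causal-LM training.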

    # Tokenize dataset
    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    progress(0.3, "Tokenizing dataset...")
    tokenized_datasets = dataset_split.map(tokenize_function, batched=True)

    # Load model with LoRA configuration
    progress(0.4, "Loading model...")
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )
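    # Low-rank adapters (r=8, alpha=16) on both the attention projections and
    # the MLP projections: a few more trainable parameters than attention-only
    # LoRA, usually in exchange for noticeably better adaptation.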

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
    )

    # Prepare model for LoRA training
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)
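    # prepare_model_for_kbit_training is applied even though the model is
    # loaded in fp16 rather than 4/8-bit: it also enables gradient
    # checkpointing and input gradients, which this LoRA setup relies on.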

    # Training arguments
    output_dir = "mistral7b_finetuned"
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        save_steps=50,
        logging_steps=10,
        learning_rate=2e-4,
        weight_decay=0.001,
        fp16=True,
        warmup_steps=50,
        lr_scheduler_type="cosine",
        report_to="none",  # Disable wandb
    )
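    # Effective batch size = per_device_train_batch_size * gradient_accumulation_steps
    # = 1 * 4 = 4 sequences per optimizer step.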

    # Initialize trainer
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
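    # mlm=False makes the collator copy input_ids into labels, i.e. plain
    # next-token (causal LM) loss rather than masked-LM loss.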

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        data_collator=data_collator,
    )

    # Train model
    progress(0.5, "Training model...")
    trainer.train()

    # Save model
    progress(0.9, "Saving model...")
    trainer.model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Test with sample prompts
    progress(0.95, "Testing model...")
    test_prompts = [
        "How was the Math exam?",
        "Good morning students! How are you all?",
        "What should you do if you get into a fight with a friend?",
        "Did you complete your science project?",
        "What did you learn in class today?"
    ]

    # The trainer's model already carries the trained LoRA adapters; wrapping
    # it in PeftModel.from_pretrained again would nest adapters on top of a
    # model that is already a PeftModel. Reuse it directly for inference.
    fine_tuned_model = trainer.model
    fine_tuned_model.eval()

    # Generate responses
    results = []
    for prompt in test_prompts:
        formatted_prompt = f"<s> [INST] Teacher ** {prompt} [/INST] Student**"
        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = fine_tuned_model.generate(
                **inputs,
                max_new_tokens=128,  # cap the completion itself (max_length would count the prompt too); 128 is an assumed budget
                temperature=0.7,
                top_p=0.95,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,  # silence the missing-pad-token warning
            )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        student_part = response.split("Student**")[1].strip() if "Student**" in response else response
        results.append({
            "prompt": prompt,
            "response": student_part
        })

    # Save results
    with open("test_results.json", "w") as f:
        json.dump(results, f, indent=2)

    progress(1.0, "Completed!")
    return results
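
# Example shape of the returned results (values here are illustrative only):
# [
#   {"prompt": "How was the Math exam?", "response": "It was tough, but I think I did well."},
#   ...
# ]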

# Define Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Mistral 7B Fine-Tuning for Student Bot")

    with gr.Tab("Fine-tune Model"):
        with gr.Row():
            csv_input = gr.File(label="Upload Teacher-Student CSV")
        with gr.Row():
            sample_size = gr.Slider(minimum=10, maximum=1000, value=100, step=10, label="Sample Size")
            epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Epochs")
        with gr.Row():
            start_btn = gr.Button("Start Fine-Tuning")
        with gr.Row():
            output = gr.JSON(label="Results")

        start_btn.click(finetune_model, inputs=[csv_input, sample_size, epochs], outputs=[output])
with gr.Tab("About"): | |
gr.Markdown(""" | |
## Fine-Tuning Mistral 7B for Student Bot | |
This app fine-tunes the Mistral 7B model to respond like a student to teacher prompts. | |
### Requirements | |
- CSV file with teacher-student conversation pairs | |
- GPU acceleration (provided by this Space) | |
### Process | |
1. Upload your CSV file | |
2. Set sample size and number of epochs | |
3. Click "Start Fine-Tuning" | |
4. View test results with sample prompts | |
""") | |
# Launch app
demo.launch()