Spaces:

amaltese
/

avatartestspace

Runtime error

App Files Files Community

avatartestspace / app.py

amaltese

Update app.py

de78a7b verified 4 months ago

raw

history blame

10.5 kB

	import gradio as gr
	import os
	import torch
	import json
	import pandas as pd
	from datasets import Dataset
	from transformers import (
	AutoModelForCausalLM,
	AutoTokenizer,
	TrainingArguments,
	Trainer,
	DataCollatorForLanguageModeling
	)
	from peft import (
	LoraConfig,
	get_peft_model,
	prepare_model_for_kbit_training,
	PeftModel
	)
	import spaces
	from huggingface_hub import login

	# Point the transformers cache at a scratch directory and make sure it exists.
	# NOTE(review): this assignment runs after `transformers` has already been
	# imported at the top of the file; confirm the library still honours a value
	# set this late.
	os.environ.update({'TRANSFORMERS_CACHE': '/tmp/hf_cache'})
	os.makedirs('/tmp/hf_cache', exist_ok=True)

	# Authenticate with the Hugging Face Hub when HF_TOKEN is provided.
	hf_token = os.environ.get("HF_TOKEN")
	if not hf_token:
	    print("No Hugging Face token found. You may encounter access issues with gated models.")
	else:
	    login(token=hf_token)
	    print("Successfully logged in to Hugging Face Hub")

	def sample_from_csv(csv_file, sample_size=100):
	    """Sample rows from a teacher/student CSV and format them for training.

	    Columns are matched by header name: a header containing "teacher",
	    "instructor" or "prompt" becomes the teacher column, and one containing
	    "student", "response" or "answer" becomes the student column (when
	    several headers match, the last one wins). If either column cannot be
	    identified, the first two columns of the file are used instead.

	    Rows where either cell is missing or blank after stripping are skipped.

	    Args:
	        csv_file: Anything ``pandas.read_csv`` accepts (path or file object).
	        sample_size: Maximum number of rows to draw. Coerced to ``int`` so a
	            float coming from a UI slider is accepted. When it is at least
	            the number of rows, the whole file is used.

	    Returns:
	        A ``Dataset`` with a single ``"text"`` column holding one
	        ``<s> [INST] Teacher ... [/INST] Student ... </s>`` string per kept row.

	    Raises:
	        ValueError: If the columns must be guessed by position but the CSV
	            has fewer than two columns.
	    """
	    df = pd.read_csv(csv_file)

	    # Display CSV info
	    print(f"CSV columns: {df.columns.tolist()}")
	    print(f"Total rows in CSV: {len(df)}")

	    # Try to identify teacher and student columns by header keywords
	    teacher_col = None
	    student_col = None
	    for col in df.columns:
	        col_lower = str(col).lower()
	        if 'teacher' in col_lower or 'instructor' in col_lower or 'prompt' in col_lower:
	            teacher_col = col
	        elif 'student' in col_lower or 'response' in col_lower or 'answer' in col_lower:
	            student_col = col

	    # If we couldn't identify both columns, fall back to the first two
	    if teacher_col is None or student_col is None:
	        if len(df.columns) < 2:
	            raise ValueError(
	                "CSV must contain at least two columns (teacher and student); "
	                f"found {len(df.columns)}: {df.columns.tolist()}"
	            )
	        teacher_col = df.columns[0]
	        student_col = df.columns[1]
	        print(f"Using columns: {teacher_col} (teacher) and {student_col} (student)")
	    else:
	        print(f"Identified columns: {teacher_col} (teacher) and {student_col} (student)")

	    # Sample rows (fixed random_state keeps the draw reproducible)
	    sample_size = int(sample_size)
	    if sample_size >= len(df):
	        sampled_df = df
	    else:
	        sampled_df = df.sample(n=sample_size, random_state=42)

	    # Format data. Iterating the two columns directly avoids the per-row
	    # dtype coercion that iterrows() performs.
	    texts = []
	    for teacher_raw, student_raw in zip(sampled_df[teacher_col], sampled_df[student_col]):
	        # Skip missing cells before stringifying them
	        if pd.isna(teacher_raw) or pd.isna(student_raw):
	            continue

	        teacher_text = str(teacher_raw).strip()
	        student_text = str(student_raw).strip()

	        # Skip rows that are blank once whitespace is removed
	        if not teacher_text or not student_text:
	            continue

	        # Format according to the document format:
	        # <s> [INST] Teacher <Dialogue> [/INST] Student <Dialogue> </s>
	        texts.append(f"<s> [INST] Teacher {teacher_text} [/INST] Student {student_text} </s>")

	    print(f"Created {len(texts)} formatted examples")
	    return Dataset.from_dict({"text": texts})

	@spaces.GPU
	def finetune_model(csv_file, sample_size=100, num_epochs=3, progress=gr.Progress()):
	    """Fine-tune Mistral-7B-v0.1 with LoRA on a teacher/student CSV.

	    The function samples and formats the CSV, tokenizes it, trains LoRA
	    adapters with ``Trainer``, saves the adapters and tokenizer to
	    ``mistral7b_finetuned``, generates replies for a fixed list of teacher
	    prompts, and writes those replies to ``test_results.json``.

	    Args:
	        csv_file: Uploaded CSV (path or file object) passed to
	            ``sample_from_csv``.
	        sample_size: Maximum number of CSV rows to train on.
	        num_epochs: Number of training epochs.
	        progress: Gradio progress tracker used to report each stage.

	    Returns:
	        A list of ``{"prompt": ..., "response": ...}`` dicts, one per test
	        prompt.

	    Raises:
	        gr.Error: If no file was uploaded or fewer than two usable examples
	            were produced (too few to split into train and test sets).
	    """
	    if csv_file is None:
	        raise gr.Error("Please upload a CSV file before starting fine-tuning.")

	    # Check GPU
	    if torch.cuda.is_available():
	        print(f"GPU available: {torch.cuda.get_device_name(0)}")
	        device = torch.device("cuda")
	    else:
	        print("No GPU available, fine-tuning will be extremely slow!")
	        device = torch.device("cpu")

	    # Sample data
	    progress(0.1, "Sampling data from CSV...")
	    dataset = sample_from_csv(csv_file, sample_size)
	    if len(dataset) < 2:
	        raise gr.Error(
	            f"Only {len(dataset)} usable example(s) found in the CSV; "
	            "at least 2 are needed to build train and test splits."
	        )

	    # Split dataset (seeded so repeated runs use the same split)
	    dataset_split = dataset.train_test_split(test_size=0.1, seed=42)

	    # Load tokenizer
	    progress(0.2, "Loading tokenizer...")

	    # Use only the original Mistral model
	    model_name = "mistralai/Mistral-7B-v0.1"
	    print(f"Using model: {model_name}")

	    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
	    # NOTE(review): with pad == eos, the language-modeling collator
	    # presumably masks the closing </s> out of the loss together with the
	    # padding -- confirm whether a dedicated pad token is wanted.
	    tokenizer.pad_token = tokenizer.eos_token

	    # Tokenize dataset
	    def tokenize_function(examples):
	        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

	    progress(0.3, "Tokenizing dataset...")
	    tokenized_datasets = dataset_split.map(tokenize_function, batched=True)

	    # Load model with LoRA configuration
	    progress(0.4, "Loading model...")
	    lora_config = LoraConfig(
	        r=8,
	        lora_alpha=16,
	        target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
	        lora_dropout=0.05,
	        bias="none",
	        task_type="CAUSAL_LM"
	    )

	    model = AutoModelForCausalLM.from_pretrained(
	        model_name,
	        torch_dtype=torch.float16,
	        device_map="auto",
	        token=hf_token,
	    )

	    # Prepare model for LoRA training
	    # NOTE(review): the base model is loaded in float16 without quantization;
	    # confirm that prepare_model_for_kbit_training is intended here.
	    model = prepare_model_for_kbit_training(model)
	    model = get_peft_model(model, lora_config)

	    # Print model info
	    print(f"Model loaded: {model_name}")
	    model_params = sum(p.numel() for p in model.parameters())
	    print(f"Model parameters: {model_params:,}")

	    # Training arguments
	    output_dir = "mistral7b_finetuned"
	    training_args = TrainingArguments(
	        output_dir=output_dir,
	        num_train_epochs=num_epochs,
	        per_device_train_batch_size=1,
	        gradient_accumulation_steps=4,
	        save_steps=50,
	        logging_steps=10,
	        learning_rate=2e-4,
	        weight_decay=0.001,
	        fp16=True,
	        warmup_steps=50,
	        lr_scheduler_type="cosine",
	        report_to="none",  # Disable wandb
	    )

	    # Initialize trainer
	    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
	    trainer = Trainer(
	        model=model,
	        args=training_args,
	        train_dataset=tokenized_datasets["train"],
	        eval_dataset=tokenized_datasets["test"],
	        data_collator=data_collator,
	    )

	    # Train model
	    progress(0.5, "Training model...")
	    trainer.train()

	    # Save model
	    progress(0.9, "Saving model...")
	    trainer.model.save_pretrained(output_dir)
	    tokenizer.save_pretrained(output_dir)

	    # Test with sample prompts
	    progress(0.95, "Testing model...")
	    test_prompts = [
	        "How was the Math exam?",
	        "Good morning students! How are you all?",
	        "What should you do if you get into a fight with a friend?",
	        "Did you complete your science project?",
	        "What did you learn in class today?"
	    ]

	    # Reuse the model that was just trained. It already carries the LoRA
	    # adapters, so wrapping it in PeftModel a second time is unnecessary and
	    # risks generating from adapters that were not loaded correctly.
	    fine_tuned_model = trainer.model
	    fine_tuned_model.eval()  # disable dropout for generation

	    # Generate responses
	    results = []
	    for prompt in test_prompts:
	        formatted_prompt = f"<s> [INST] Teacher {prompt} [/INST] Student"
	        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)

	        with torch.no_grad():
	            outputs = fine_tuned_model.generate(
	                **inputs,
	                max_length=200,
	                temperature=0.7,
	                top_p=0.95,
	                do_sample=True,
	            )

	        # The generated sequence starts with the prompt tokens. Decode only
	        # what follows, so a reply that itself contains the word "Student"
	        # is not cut short.
	        prompt_length = inputs["input_ids"].shape[1]
	        generated_tokens = outputs[0][prompt_length:]
	        student_part = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

	        results.append({
	            "prompt": prompt,
	            "response": student_part
	        })

	    # Save results
	    with open("test_results.json", "w", encoding="utf-8") as f:
	        json.dump(results, f, indent=2)

	    progress(1.0, "Completed!")
	    return results

	# Build the three-tab Gradio UI: system check, fine-tuning, and about.
	with gr.Blocks() as demo:
	    gr.Markdown("# Mistral 7B Fine-Tuning for Student Bot")

	    with gr.Tab("System Check"):
	        check_btn = gr.Button("Check GPU and Authentication Status")
	        system_output = gr.Textbox(label="System Status", lines=5)

	        @spaces.GPU
	        def check_system():
	            """Return a newline-joined report on GPU, token, and model access."""
	            report = []

	            # GPU availability and total memory
	            if not torch.cuda.is_available():
	                report.append("❌ NO GPU DETECTED.")
	            else:
	                report.append(f"✅ GPU AVAILABLE: {torch.cuda.get_device_name(0)}")
	                total_bytes = torch.cuda.get_device_properties(0).total_memory
	                report.append(f"Total GPU memory: {total_bytes / 1e9:.2f} GB")

	            # Presence of the Hugging Face token in the environment
	            report.append(
	                "✅ Hugging Face token found"
	                if os.environ.get("HF_TOKEN")
	                else "❌ No Hugging Face token found. You may encounter access issues with gated models."
	            )

	            # Reachability of the gated Mistral checkpoint
	            try:
	                from huggingface_hub import model_info
	                details = model_info("mistralai/Mistral-7B-v0.1", token=hf_token)
	                report.append(f"✅ Access to Mistral-7B-v0.1 model verified: {details.modelId}")
	            except Exception as err:
	                report.append(f"❌ Cannot access Mistral-7B-v0.1 model: {err}")

	            return "\n".join(report)

	        check_btn.click(check_system, inputs=[], outputs=[system_output])

	    with gr.Tab("Fine-tune Model"):
	        # Upload widget for the training data
	        with gr.Row():
	            csv_input = gr.File(label="Upload Teacher-Student CSV")

	        # Training hyper-parameters exposed to the user
	        with gr.Row():
	            sample_size = gr.Slider(minimum=10, maximum=1000, value=100, step=10, label="Sample Size")
	            epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Epochs")

	        with gr.Row():
	            start_btn = gr.Button("Start Fine-Tuning")

	        # Generated test replies are shown as JSON
	        with gr.Row():
	            output = gr.JSON(label="Results")

	        start_btn.click(finetune_model, inputs=[csv_input, sample_size, epochs], outputs=[output])

	    with gr.Tab("About"):
	        gr.Markdown("""
	## Fine-Tuning Mistral 7B for Student Bot

	This app fine-tunes the original Mistral-7B-v0.1 model to respond like a student to teacher prompts.

	### Requirements
	- CSV file with teacher-student conversation pairs
	- GPU acceleration (provided by this Space)
	- Hugging Face authentication for accessing Mistral-7B-v0.1 (which is a gated model)

	### Process
	1. Upload your CSV file
	2. Set sample size and number of epochs
	3. Click "Start Fine-Tuning"
	4. View test results with sample prompts

	### Important Notes
	- Fine-tuning can take several hours depending on your sample size and epochs
	- The model will be saved in the Space and can be downloaded for further use
	""")

	# Start the Gradio server
	demo.launch()