import gradio as gr
import pandas as pd
import torch
import os
import logging
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
import spaces  # Import the spaces library

# Initialize logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Function to load and process data
def load_data(csv_file):
    try:
        df = pd.read_csv(csv_file)
        logger.info(f"CSV columns: {df.columns.tolist()}")
        logger.info(f"Total rows in CSV: {len(df)}")
        return df
    except Exception as e:
        logger.error(f"Error loading CSV: {e}")
        return None

# Function to prepare dataset
def prepare_dataset(df, teacher_col, student_col, num_samples=100):
    # Extract and format data
    logger.info(f"Using columns: {teacher_col} (teacher) and {student_col} (student)")
    formatted_data = []
    for i in range(min(num_samples, len(df))):
        teacher_text = str(df.iloc[i][teacher_col])
        student_text = str(df.iloc[i][student_col])
        # Create prompt
        formatted_text = f"### Teacher: {teacher_text}\n### Student: {student_text}"
        formatted_data.append({"text": formatted_text})
    logger.info(f"Created {len(formatted_data)} formatted examples")
    # Create dataset
    dataset = Dataset.from_list(formatted_data)
    # Split dataset
    train_val_split = dataset.train_test_split(test_size=0.1, seed=42)
    return train_val_split
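
# For illustration, each record produced above has the shape
# (the values are invented for this example):
#     {"text": "### Teacher: What causes tides?\n### Student: Mostly the Moon's gravity."}
# train_test_split returns a DatasetDict whose splits are keyed "train" and
# "test"; the "test" split is used as the validation set further down.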

# Function to tokenize data
def tokenize_data(dataset, tokenizer, max_length=512):
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=max_length,
            padding="max_length",
        )

    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    return tokenized_dataset
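
# Note: tokenization above produces only input_ids and attention_mask. For
# causal-LM fine-tuning, the labels are derived from input_ids at batch time
# by the DataCollatorForLanguageModeling(mlm=False) passed to the Trainer
# below, which also masks padding positions out of the loss with -100.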

# Main fine-tuning function with memory optimizations
def finetune_model(model_id, train_data, output_dir, epochs, batch_size=None):
    """
    Fine-tune a model with optimized memory settings to prevent CUDA OOM errors.
    """
    logger.info(f"Using model: {model_id}")
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # ============ MEMORY OPTIMIZATION 1: REDUCED BATCH SIZE ============
    # A smaller batch size dramatically reduces memory usage during training
    actual_batch_size = 8 if batch_size is None else min(batch_size, 8)
    logger.info(f"Using batch size: {actual_batch_size} (capped at 8 to save memory)")

    # ============ MEMORY OPTIMIZATION 2: 8-bit QUANTIZATION ============
    # Passing load_in_8bit directly to from_pretrained is deprecated in recent
    # transformers releases; BitsAndBytesConfig is the supported route.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # 8-bit weights to reduce memory usage
        device_map="auto",          # Automatically place layers across available devices
        use_cache=False,            # Disable KV cache, which uses extra memory
        torch_dtype=torch.float16,  # Lower precision for non-quantized tensors
    )
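
    # Back-of-envelope memory math: a 7B-parameter model holds ~14 GB of
    # weights in fp16 (7e9 params x 2 bytes); 8-bit quantization roughly
    # halves that to ~7 GB, before activations and optimizer state.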

    # Count model parameters
    logger.info(f"Model parameters: {model.num_parameters():,}")
    # Prepare model for training with quantization
    model = prepare_model_for_kbit_training(model)

    # ============ MEMORY OPTIMIZATION 3: GRADIENT CHECKPOINTING ============
    model.gradient_checkpointing_enable()
    logger.info("Gradient checkpointing enabled: trading computation for memory savings")

    # ============ MEMORY OPTIMIZATION 4: OPTIMIZED LORA CONFIG ============
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=4,                                  # REDUCED from the usual default of 8/16 to save memory
        lora_alpha=16,                        # Scaling factor
        lora_dropout=0.1,                     # Dropout probability for regularization
        target_modules=["q_proj", "v_proj"],  # Only attention query and value projections
    )
    logger.info("Using optimized LoRA parameters with reduced rank (r=4) and targeted modules")
    # Apply LoRA adapters to the model
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()  # Print trainable parameters info
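
    # Rough parameter math: each adapted linear layer gains two low-rank
    # matrices, A (r x d_in) and B (d_out x r), i.e. r * (d_in + d_out) extra
    # weights. At r=4 on q_proj/v_proj only, that is a few million trainable
    # parameters on a 7B model, well under 0.1% of the total
    # (print_trainable_parameters above logs the exact share).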

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        # ============ MEMORY OPTIMIZATION 5: REDUCED BATCH SIZE IN ARGS ============
        per_device_train_batch_size=actual_batch_size,
        per_device_eval_batch_size=actual_batch_size,
        # ============ MEMORY OPTIMIZATION 6: MIXED PRECISION TRAINING ============
        fp16=True,  # Use FP16 for mixed precision training
        # ============ MEMORY OPTIMIZATION 7: GRADIENT ACCUMULATION ============
        gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps
        # ============ MEMORY OPTIMIZATION 8: GRADIENT CHECKPOINTING IN ARGS ============
        gradient_checkpointing=True,
        # Other parameters
        logging_steps=10,
        save_strategy="epoch",
        evaluation_strategy="epoch",  # Renamed to eval_strategy in newer transformers releases
        learning_rate=2e-4,
        weight_decay=0.01,
        warmup_ratio=0.03,
        # ============ MEMORY OPTIMIZATION 9: FUSED OPTIMIZER ============
        optim="adamw_torch_fused",  # Fused AdamW kernels cut optimizer overhead (mainly a speed win; the memory saving is modest)
        # ============ MEMORY OPTIMIZATION 10: REDUCED LOGGING OVERHEAD ============
        report_to="none",  # Disable external logging integrations (e.g. W&B)
    )
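
    # With a per-device batch size of 8 and 4 gradient-accumulation steps, each
    # optimizer update sees an effective batch of 8 * 4 = 32 examples, at
    # roughly the activation-memory cost of a batch of 8.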

    # Initialize the Trainer. Without an explicit collator the batches would
    # carry no labels and the Trainer could not compute a loss, so we add a
    # causal-LM collator that copies input_ids to labels (padding masked to -100).
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data["train"],
        eval_dataset=train_data["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # ============ MEMORY OPTIMIZATION 11: MANAGE CUDA CACHE ============
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        logger.info("CUDA cache cleared before training")

    # Start training
    logger.info("Starting training...")
    trainer.train()

    # Save the model (for a PEFT model this writes only the LoRA adapter weights)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    logger.info(f"Model saved to {output_dir}")
    return model, tokenizer
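
# For reference, a minimal sketch of loading the saved adapter back for
# inference. This helper is not wired into the UI below; the prompt format
# mirrors prepare_dataset, and the greedy-decoding settings are assumptions.
def generate_student_reply(base_model_id, adapter_dir, teacher_text, max_new_tokens=64):
    from peft import PeftModel

    tok = AutoTokenizer.from_pretrained(adapter_dir)
    base = AutoModelForCausalLM.from_pretrained(
        base_model_id, device_map="auto", torch_dtype=torch.float16
    )
    model = PeftModel.from_pretrained(base, adapter_dir)  # attach the LoRA weights
    prompt = f"### Teacher: {teacher_text}\n### Student:"  # same format as training
    inputs = tok(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
    # Return only the newly generated tokens, decoded to text
    return tok.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)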

# Gradio interface functions
def process_csv(file, teacher_col, student_col, num_samples):
    df = load_data(file.name)
    if df is None:
        return "Error loading CSV file"
    # Surface missing-column problems before training is attempted
    missing = [col for col in (teacher_col, student_col) if col not in df.columns]
    if missing:
        return f"CSV loaded with {len(df)} rows, but missing columns: {missing}. Available: {df.columns.tolist()}"
    return f"CSV loaded successfully with {len(df)} rows"

def start_fine_tuning(file, teacher_col, student_col, model_id, epochs, batch_size, num_samples):
    try:
        # Load and process data
        df = load_data(file.name)
        if df is None:
            return "Error loading CSV file"
        # Prepare dataset
        dataset = prepare_dataset(df, teacher_col, student_col, num_samples=int(num_samples))
        # Load tokenizer for preprocessing
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        # Tokenize dataset
        tokenized_dataset = {
            "train": tokenize_data(dataset["train"], tokenizer),
            "validation": tokenize_data(dataset["test"], tokenizer),
        }
        # Create output directory
        output_dir = "./fine_tuned_model"
        os.makedirs(output_dir, exist_ok=True)
        # Finetune model with memory optimizations
        finetune_model(
            model_id=model_id,
            train_data=tokenized_dataset,
            output_dir=output_dir,
            epochs=int(epochs),
            batch_size=int(batch_size),
        )
        return "Fine-tuning completed successfully!"
    except Exception as e:
        logger.error(f"Error during fine-tuning: {e}")
        return f"Error during fine-tuning: {str(e)}"

# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Teacher-Student Bot Fine-Tuning")
    with gr.Tab("Upload Data"):
        file_input = gr.File(label="Upload CSV File")
        with gr.Row():
            teacher_col = gr.Textbox(label="Teacher Column", value="Unnamed: 0")
            student_col = gr.Textbox(label="Student Column", value="idx")
        num_samples = gr.Slider(label="Number of Samples", minimum=10, maximum=1000, value=100, step=10)
        upload_btn = gr.Button("Process CSV")
        csv_output = gr.Textbox(label="CSV Processing Result")
        upload_btn.click(process_csv, inputs=[file_input, teacher_col, student_col, num_samples], outputs=csv_output)
    with gr.Tab("Fine-Tune"):
        model_id = gr.Textbox(label="Model ID", value="mistralai/Mistral-7B-v0.1")
        with gr.Row():
            batch_size = gr.Number(label="Batch Size", value=8, info="Recommended: 8 or lower for 7B models")
            epochs = gr.Number(label="Number of Epochs", value=2)
        training_btn = gr.Button("Start Fine-Tuning")
        training_output = gr.Textbox(label="Training Progress")
        training_btn.click(
            start_fine_tuning,
            inputs=[file_input, teacher_col, student_col, model_id, epochs, batch_size, num_samples],
            outputs=training_output,
        )
# Launch the app - REMOVED the spaces.zero.mount() call that was causing the error
demo.queue().launch(debug=True)