from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, prepare_model_for_kbit_training
import torch
# Configure 4-bit quantization (QLoRA-style NF4)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
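# NF4 stores the frozen base weights in 4-bit normal-float buckets, double quantization
# also compresses the quantization constants, and the matmuls run in float16. This is the
# usual QLoRA recipe and should let phi-2 (~2.7B parameters) train on a single consumer GPU.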
# Load the base model with the quantization config
model_name = "microsoft/phi-2"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
model.config.use_cache = False  # KV cache is incompatible with gradient checkpointing during training
# Load tokenizer; phi-2 has no pad token, so reuse EOS for padding
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)
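# prepare_model_for_kbit_training casts the non-quantized parameters (layer norms, LM head)
# to float32 for stability and, in recent peft versions, enables gradient checkpointing by default.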
# Configure LoRA
peft_config = LoraConfig(
    r=16,                  # adapter rank
    lora_alpha=32,         # scaling factor (effective scale is lora_alpha / r)
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "dense"]  # phi-2 attention q/k/v and output projections
)
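# Optional sanity check (sketch): inspect the trainable-parameter footprint of this config.
# get_peft_model injects adapters in place and SFTTrainer below applies peft_config itself,
# so only run this on a throwaway copy of the model / in a separate session.
# from peft import get_peft_model
# get_peft_model(model, peft_config).print_trainable_parameters()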
# Load and preprocess dataset
ds = load_dataset("OpenAssistant/oasst1")
train_dataset = ds['train']
def format_conversation(example):
    """Format the conversation for instruction fine-tuning"""
    # Only process root messages (start of conversations)
    if example["role"] == "prompter" and example["parent_id"] is None:
        conversation = []
        current_msg = example
        conversation.append(("Human", current_msg["text"]))
        # Follow the conversation thread
        current_id = current_msg["message_id"]
        while current_id in message_children:
            # Get the next message in the conversation
            next_msg = message_children[current_id]
            if next_msg["role"] == "assistant":
                conversation.append(("Assistant", next_msg["text"]))
            elif next_msg["role"] == "prompter":
                conversation.append(("Human", next_msg["text"]))
            current_id = next_msg["message_id"]
        if len(conversation) >= 2:  # At least one exchange (human -> assistant)
            formatted_text = ""
            for speaker, text in conversation:
                formatted_text += f"{speaker}: {text}\n\n"
            return {"text": formatted_text.strip()}
    return {"text": None}
# Build message relationships (parent_id -> child message, used by format_conversation above)
# Note: a plain dict keeps only the last child seen per parent, so a single reply branch is followed
print("Building conversation threads...")
message_children = {}
for example in train_dataset:
    if example["parent_id"] is not None:
        message_children[example["parent_id"]] = example
# Format complete conversations
print("\nFormatting conversations...")
processed_dataset = []
for example in train_dataset:
    result = format_conversation(example)
    if result["text"] is not None:
        processed_dataset.append(result)
        if len(processed_dataset) % 100 == 0 and len(processed_dataset) > 0:
            print(f"Found {len(processed_dataset)} valid conversations")
print(f"Final dataset size: {len(processed_dataset)} conversations")
# Convert the processed examples to a Dataset for training
train_dataset = Dataset.from_list(processed_dataset)
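# Quick sanity check: print the start of one formatted example to verify the
# Human:/Assistant: layout before training
print(train_dataset[0]["text"][:300])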
# Configure SFT parameters
sft_config = SFTConfig(
    output_dir="phi2-finetuned",
    num_train_epochs=1,
    max_steps=500,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    weight_decay=0.001,
    logging_steps=1,
    logging_strategy="steps",
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    push_to_hub=False,
    max_seq_length=512,
    report_to="none",
)
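# Note: with max_steps=500 set, training stops after 500 optimizer steps regardless of
# num_train_epochs (a positive max_steps takes precedence in the Trainer). The effective
# batch size here is per_device_train_batch_size * gradient_accumulation_steps = 4.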
# Initialize trainer (applies the LoRA config to the quantized model)
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    args=sft_config,
    processing_class=tokenizer,  # pass the configured tokenizer (older TRL versions use tokenizer= instead)
)
# Train the model
trainer.train()
# Save the trained model in Hugging Face format (with peft_config set, this writes the LoRA adapter weights)
trainer.save_model("phi2-finetuned-final")
# Also save a raw PyTorch checkpoint of the state dict and configs
model_save_path = "phi2-finetuned-final/model.pt"
torch.save({
    'model_state_dict': trainer.model.state_dict(),
    'config': trainer.model.config,
    'peft_config': peft_config,
}, model_save_path)
print(f"Model saved in PyTorch format at: {model_save_path}")