import gc

import torch
import yaml
from datasets import load_dataset
from huggingface_hub import ModelCard, ModelCardData, HfApi
from jinja2 import Template
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from trl import SFTTrainer

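# Base model to fine-tune, the name for the resulting checkpoint, and the local HF cache dir.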
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
NEW_MODEL_NAME = "opus-phi-3-mini-4k-instruct"
CACHE_DIR = "./../cache"

# Hub ID of the SFT dataset; left blank here, so it must be filled in before running.
# The dataset is expected to carry ShareGPT-style "conversations" records.
DATASET_NAME = ""
SPLIT = "train"

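# Training hyperparameters. `license` and `username` are not used during training itself;
# presumably they feed a later model-card / Hub-upload step (hence the ModelCard and HfApi imports).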
MAX_SEQ_LENGTH = 4096
num_train_epochs = 1
license = "apache-2.0"
username = "darshanmakwana412"
learning_rate = 1.41e-5
per_device_train_batch_size = 4
gradient_accumulation_steps = 1

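# Load the base model, its tokenizer, and the raw dataset. `trust_remote_code=True` lets
# older transformers releases run the custom Phi-3 modeling code shipped on the Hub.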
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR, trust_remote_code=True)
dataset = load_dataset(DATASET_NAME, split=SPLIT)

# End-of-sequence marker appended to each sample; use the token string, not the integer id.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Flatten ShareGPT-style `conversations` into single ChatML-formatted strings."""
    convos = examples["conversations"]
    texts = []
    # ChatML role markers: each turn opens with <|im_start|><role> and closes with <|im_end|>.
    mapper = {
        "system": "<|im_start|>system\n",
        "human": "\n<|im_start|>user\n",
        "gpt": "\n<|im_start|>assistant\n",
    }
    end_mapper = {"system": "<|im_end|>", "human": "<|im_end|>", "gpt": "<|im_end|>"}
    for convo in convos:
        # Concatenate every turn of the conversation and terminate with EOS.
        text = "".join(f"{mapper[(turn := x['from'])]} {x['value']}\n{end_mapper[turn]}" for x in convo)
        texts.append(f"{text}{EOS_TOKEN}")
    return {"text": texts}
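
# With the mapping above, a two-turn exchange renders roughly as:
#   <|im_start|>user
#   Hello!
#   <|im_end|>
#   <|im_start|>assistant
#   Hi!
#   <|im_end|>
# followed by the tokenizer's EOS token.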

dataset = dataset.map(formatting_prompts_func, batched=True)
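
# Optional sanity check: inspect one formatted sample before training.
print(dataset[0]["text"])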

args = TrainingArguments(
    evaluation_strategy="no",  # no eval_dataset is passed to the trainer below
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,  # trade extra compute for lower activation memory
    learning_rate=learning_rate,
    fp16=not torch.cuda.is_bf16_supported(),  # fall back to fp16 on GPUs without bf16
    bf16=torch.cuda.is_bf16_supported(),
    max_steps=-1,  # -1 means train for num_train_epochs rather than a fixed step count
    num_train_epochs=num_train_epochs,
    save_strategy="epoch",
    logging_steps=10,
    output_dir=NEW_MODEL_NAME,
    optim="paged_adamw_32bit",  # paged AdamW from bitsandbytes
    lr_scheduler_type="linear",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=dataset,
    dataset_text_field="text",  # column produced by formatting_prompts_func above
    max_seq_length=MAX_SEQ_LENGTH,
)

# Release Python-level garbage and cached GPU memory before training starts.
gc.collect()
torch.cuda.empty_cache()

trainer.train()
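
# Optional follow-up (an assumption, not in the original script): persist the fine-tuned
# model and tokenizer under NEW_MODEL_NAME for later reloading or a Hub push.
trainer.save_model(NEW_MODEL_NAME)
tokenizer.save_pretrained(NEW_MODEL_NAME)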