from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from huggingface_hub import ModelCard, ModelCardData, HfApi
from datasets import load_dataset
from jinja2 import Template
from trl import SFTTrainer
import yaml
import torch
import gc
import os

# Model configs
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
NEW_MODEL_NAME = "opus-phi-3-mini-4k-instruct"
CACHE_DIR = "./../cache"

# Dataset configs
DATASET_NAME = ""  # set this to the Hugging Face dataset you want to fine-tune on
SPLIT = "train"

# The maximum sequence length the model will handle
MAX_SEQ_LENGTH = 4096

# Training hyperparameters and Hub metadata
num_train_epochs = 1
license = "apache-2.0"
username = "darshanmakwana412"
learning_rate = 1.41e-5
per_device_train_batch_size = 4
gradient_accumulation_steps = 1

# Load the base model and its tokenizer
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR, trust_remote_code=True)

# Load the training split of the dataset
dataset = load_dataset(DATASET_NAME, split=SPLIT)

# The EOS token string marks the end of a conversation
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    # Extract the conversations from the examples.
    convos = examples["conversations"]
    # Initialize an empty list to store the formatted texts.
    texts = []
    # Map the 'from' field of each turn to a prefix.
    mapper = {"system": "system\n", "human": "\nuser\n", "gpt": "\nassistant\n"}
    # Map the 'from' field of each turn to a suffix.
    end_mapper = {"system": "", "human": "", "gpt": ""}
    # Iterate over each conversation.
    for convo in convos:
        # Join each turn with its corresponding prefix and suffix,
        # then append the EOS token to the end of the conversation.
        text = "".join(f"{mapper[(turn := x['from'])]} {x['value']}\n{end_mapper[turn]}" for x in convo)
        texts.append(f"{text}{EOS_TOKEN}")
    # Return the formatted texts under a new "text" column.
    return {"text": texts}

dataset = dataset.map(formatting_prompts_func, batched=True)

args = TrainingArguments(
    evaluation_strategy="no",  # no eval_dataset is passed to the trainer, so evaluation is disabled
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,
    learning_rate=learning_rate,
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    max_steps=-1,
    num_train_epochs=num_train_epochs,
    save_strategy="epoch",
    logging_steps=10,
    output_dir=NEW_MODEL_NAME,
    optim="paged_adamw_32bit",
    lr_scheduler_type="linear",
)

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    dataset_text_field="text",  # the dataset was already formatted above, so point at its "text" column
    max_seq_length=MAX_SEQ_LENGTH,
)

# Free any cached memory before training starts
gc.collect()
torch.cuda.empty_cache()

trainer.train()
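
# --- Optional sanity check (sketch, not part of the original listing) ---
# A quick generation pass is a cheap way to confirm the fine-tuned model still
# produces coherent text. The prompt string and generation settings below are
# arbitrary choices; the prompt only roughly mimics the plain role-label format
# used in formatting_prompts_func above.
prompt = "system\n You are a helpful assistant.\n\nuser\n What is the capital of France?\n\nassistant\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64)
# Decode only the newly generated tokens, skipping the prompt.
print(tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))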
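
# --- Optional: save and publish (sketch) ---
# The huggingface_hub imports (ModelCard, ModelCardData, HfApi) and the
# `username` / `license` variables above are not used in this listing, which
# suggests a publishing step. A minimal version under that assumption follows;
# the jinja2 Template and yaml imports hint at a custom card template, but the
# default huggingface_hub template is used here instead. push_to_hub creates
# the repo if it does not exist, so HfApi is not strictly required.
trainer.save_model(NEW_MODEL_NAME)         # write the fine-tuned weights and config to the output dir
tokenizer.save_pretrained(NEW_MODEL_NAME)  # keep the tokenizer alongside the weights

repo_id = f"{username}/{NEW_MODEL_NAME}"
model.push_to_hub(repo_id)                 # upload the fine-tuned weights
tokenizer.push_to_hub(repo_id)             # upload the tokenizer

# Build a minimal model card from the default template and push it as README.md.
card_data = ModelCardData(license=license, base_model=MODEL_ID, library_name="transformers")
card = ModelCard.from_template(card_data, model_id=NEW_MODEL_NAME)
card.push_to_hub(repo_id)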
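
# --- Optional: 4-bit reload for inference (sketch) ---
# BitsAndBytesConfig is imported above but never used. One way it could be used
# after training is to reload the saved checkpoint in 4-bit for memory-efficient
# inference. This is an assumption, not part of the original listing, and it
# requires the bitsandbytes and accelerate packages to be installed.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
)
model_4bit = AutoModelForCausalLM.from_pretrained(
    NEW_MODEL_NAME,                  # the output dir written by trainer.save_model above
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)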