from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from huggingface_hub import ModelCard, ModelCardData, HfApi
from datasets import load_dataset
from jinja2 import Template
from trl import SFTTrainer
import yaml
import torch

# Model Configs
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
NEW_MODEL_NAME = "opus-phi-3-mini-4k-instruct"
CACHE_DIR = "./../cache"

# Dataset Configs
DATASET_NAME = ""
SPLIT = "train"

# Maximum sequence length the model will handle
MAX_SEQ_LENGTH = 4096
num_train_epochs = 1
license = "apache-2.0"
username = "darshanmakwana412"
learning_rate = 1.41e-5
per_device_train_batch_size = 4
gradient_accumulation_steps = 1

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR, trust_remote_code=True)
dataset = load_dataset(DATASET_NAME, split=SPLIT)
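
# Note (assumption, not in the original script): some causal-LM tokenizers ship
# without a pad token; if this checkpoint's tokenizer is one of them, reuse the
# EOS token for padding so batched training does not fail. This is a no-op if a
# pad token is already defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token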

# EOS token marks the end of a sequence; it is appended to every formatted example
# so the model learns where to stop generating.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    # Extract the conversations from the examples.
    convos = examples["conversations"]
    # Initialize an empty list to store the formatted texts.
    texts = []
    # Define a dictionary to map the 'from' field in the conversation to a prefix.
    mapper = {"system": "system\n", "human": "\nuser\n", "gpt": "\nassistant\n"}
    # Define a dictionary to map the 'from' field in the conversation to a suffix.
    end_mapper = {"system": "", "human": "", "gpt": ""}
    # Iterate over each conversation.
    for convo in convos:
        # Format the conversation by joining each turn with its corresponding prefix and suffix.
        # Append the EOS token to the end of the conversation.
        text = "".join(f"{mapper[(turn := x['from'])]} {x['value']}\n{end_mapper[turn]}" for x in convo)
        texts.append(f"{text}{EOS_TOKEN}")
    # Return the formatted texts.
    return {"text": texts}

dataset = dataset.map(formatting_prompts_func, batched=True)
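
# Sanity check (a small addition, assuming a ShareGPT-style schema where each record
# carries a "conversations" list of {"from": ..., "value": ...} dicts, which is what
# the formatting function above expects): print the start of the first formatted example.
print(dataset[0]["text"][:500])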

args = TrainingArguments(
    # No eval dataset is passed to the trainer, so evaluation is disabled here;
    # switch to "steps" and provide an eval_dataset if you want periodic evaluation.
    evaluation_strategy="no",
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,
    learning_rate=learning_rate,
    # Prefer bf16 on GPUs that support it, otherwise fall back to fp16
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    max_steps=-1,
    num_train_epochs=num_train_epochs,
    save_strategy="epoch",
    logging_steps=10,
    output_dir=NEW_MODEL_NAME,
    optim="paged_adamw_32bit",
    lr_scheduler_type="linear",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=dataset,
    # The dataset was already mapped to a "text" column above, so only
    # dataset_text_field is needed; passing formatting_func as well would
    # re-apply the formatting.
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
)

import gc

# Release Python-level garbage and cached CUDA memory before training starts
gc.collect()
torch.cuda.empty_cache()

trainer.train()
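
# Sketch of the post-training steps (an assumption based on the hub-related imports
# and the `username` / `license` variables above, not part of the original run):
# save the fine-tuned weights and tokenizer locally, then push them to the Hub
# under NEW_MODEL_NAME.
trainer.save_model(NEW_MODEL_NAME)
tokenizer.save_pretrained(NEW_MODEL_NAME)
model.push_to_hub(f"{username}/{NEW_MODEL_NAME}")
tokenizer.push_to_hub(f"{username}/{NEW_MODEL_NAME}")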