from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from huggingface_hub import ModelCard, ModelCardData, HfApi
from datasets import load_dataset
from jinja2 import Template
from trl import SFTTrainer
import yaml
import torch
# Model Configs
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
NEW_MODEL_NAME = "opus-phi-3-mini-4k-instruct"
CACHE_DIR = "./../cache"
# Dataset Configs
DATASET_NAME = ""  # fill in the Hugging Face dataset repo id to fine-tune on
SPLIT = "train"
# the maximum length of the sequences that the model will handle
MAX_SEQ_LENGTH = 4096
num_train_epochs = 1
license = "apache-2.0"
username = "darshanmakwana412"
learning_rate = 1.41e-5
per_device_train_batch_size = 4
gradient_accumulation_steps = 1
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR, trust_remote_code=True)
dataset = load_dataset(DATASET_NAME, split=SPLIT)
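# Note: the formatting function below assumes a ShareGPT-style schema, i.e.
# each row has a "conversations" list of {"from": ..., "value": ...} turns,
# with "from" being one of "system", "human", or "gpt". Illustrative record
# (hypothetical, since DATASET_NAME is left unset above):
# {"conversations": [{"from": "human", "value": "Hello"},
#                    {"from": "gpt", "value": "Hi! How can I help?"}]}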
# The EOS token string is appended to mark the end of each conversation.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    # Extract the conversations from the examples.
    convos = examples["conversations"]
    # Initialize an empty list to store the formatted texts.
    texts = []
    # Map the 'from' field of each turn to a role prefix.
    mapper = {"system": "system\n", "human": "\nuser\n", "gpt": "\nassistant\n"}
    # Map the 'from' field of each turn to a role suffix.
    end_mapper = {"system": "", "human": "", "gpt": ""}
    # Iterate over each conversation.
    for convo in convos:
        # Join each turn with its role prefix and suffix, then append the
        # EOS token so the model learns where a conversation ends.
        text = "".join(f"{mapper[(turn := x['from'])]} {x['value']}\n{end_mapper[turn]}" for x in convo)
        texts.append(f"{text}{EOS_TOKEN}")
    # Return the formatted texts.
    return {"text": texts}
dataset = dataset.map(formatting_prompts_func, batched=True)
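# Optional sanity check (sketch): print one formatted sample to confirm the
# role prefixes and EOS token look right before starting a training run.
print(dataset[0]["text"][:500])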
args = TrainingArguments(
    # No eval_dataset is passed to the trainer below, so evaluation is
    # disabled; switch to "steps" only if an evaluation split is provided.
    evaluation_strategy="no",
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,
    learning_rate=learning_rate,
    # Use bf16 where the GPU supports it, otherwise fall back to fp16.
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    max_steps=-1,
    num_train_epochs=num_train_epochs,
    save_strategy="epoch",
    logging_steps=10,
    output_dir=NEW_MODEL_NAME,
    optim="paged_adamw_32bit",
    lr_scheduler_type="linear",
)
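# The effective batch size is per_device_train_batch_size *
# gradient_accumulation_steps (4 * 1 = 4 per device here); raise
# gradient_accumulation_steps rather than the per-device batch size if
# 4k-token sequences run out of GPU memory.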
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    # The dataset was already mapped into a "text" column above, so the
    # trainer reads it directly; no formatting_func is needed here.
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
)
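# Optional (sketch): report the model's parameter count as a quick footprint
# check before training.
total_params = sum(p.numel() for p in model.parameters())
print(f"Parameters: {total_params:,}")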
import gc
import os

# Free Python- and CUDA-side memory before training starts.
gc.collect()
torch.cuda.empty_cache()
trainer.train()