import gc

import torch
import yaml
from datasets import load_dataset
from huggingface_hub import ModelCard, ModelCardData, HfApi
from jinja2 import Template
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from trl import SFTTrainer

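# Base model to fine-tune, the name for the resulting checkpoint, and the local HF cache dir.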
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
NEW_MODEL_NAME = "opus-phi-3-mini-4k-instruct"
CACHE_DIR = "./../cache"

# Hub ID of the SFT dataset; left blank here, so it must be filled in before running.
# The dataset is expected to carry ShareGPT-style "conversations" records.
DATASET_NAME = ""
SPLIT = "train"

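# Training hyperparameters. `license` and `username` are not used during training itself;
# presumably they feed a later model-card / Hub-upload step (hence the ModelCard and HfApi imports).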
MAX_SEQ_LENGTH = 4096
num_train_epochs = 1
license = "apache-2.0"
username = "darshanmakwana412"
learning_rate = 1.41e-5
per_device_train_batch_size = 4
gradient_accumulation_steps = 1

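# Load the base model, its tokenizer, and the raw dataset. `trust_remote_code=True` lets
# older transformers releases run the custom Phi-3 modeling code shipped on the Hub.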
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR, trust_remote_code=True)
dataset = load_dataset(DATASET_NAME, split=SPLIT)

# End-of-sequence marker appended to each sample; use the token string, not the integer id.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Flatten ShareGPT-style `conversations` into single ChatML-formatted strings."""
    convos = examples["conversations"]
    texts = []
    # ChatML role markers: each turn opens with <|im_start|><role> and closes with <|im_end|>.
    mapper = {
        "system": "<|im_start|>system\n",
        "human": "\n<|im_start|>user\n",
        "gpt": "\n<|im_start|>assistant\n",
    }
    end_mapper = {"system": "<|im_end|>", "human": "<|im_end|>", "gpt": "<|im_end|>"}
    for convo in convos:
        # Concatenate every turn of the conversation and terminate with EOS.
        text = "".join(f"{mapper[(turn := x['from'])]} {x['value']}\n{end_mapper[turn]}" for x in convo)
        texts.append(f"{text}{EOS_TOKEN}")
    return {"text": texts}
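
# With the mapping above, a two-turn exchange renders roughly as:
#   <|im_start|>user
#   Hello!
#   <|im_end|>
#   <|im_start|>assistant
#   Hi!
#   <|im_end|>
# followed by the tokenizer's EOS token.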

dataset = dataset.map(formatting_prompts_func, batched=True)
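
# Optional sanity check: inspect one formatted sample before training.
print(dataset[0]["text"])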

args = TrainingArguments(
    evaluation_strategy="no",  # no eval_dataset is passed to the trainer below
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,  # trade extra compute for lower activation memory
    learning_rate=learning_rate,
    fp16=not torch.cuda.is_bf16_supported(),  # fall back to fp16 on GPUs without bf16
    bf16=torch.cuda.is_bf16_supported(),
    max_steps=-1,  # -1 means train for num_train_epochs rather than a fixed step count
    num_train_epochs=num_train_epochs,
    save_strategy="epoch",
    logging_steps=10,
    output_dir=NEW_MODEL_NAME,
    optim="paged_adamw_32bit",  # paged AdamW from bitsandbytes
    lr_scheduler_type="linear",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=dataset,
    dataset_text_field="text",  # column produced by formatting_prompts_func above
    max_seq_length=MAX_SEQ_LENGTH,
)

# Release Python-level garbage and cached GPU memory before training starts.
gc.collect()
torch.cuda.empty_cache()

trainer.train()
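
# Optional follow-up (an assumption, not in the original script): persist the fine-tuned
# model and tokenizer under NEW_MODEL_NAME for later reloading or a Hub push.
trainer.save_model(NEW_MODEL_NAME)
tokenizer.save_pretrained(NEW_MODEL_NAME)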