# train_lora.py: LoRA fine-tuning of Phi-3-mini-4k-instruct on flytech/python-codes-25k
from datasets import load_dataset
from peft import LoraConfig, TaskType
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer, SFTConfig
import torch
import wandb
cache_dir = "./../cache"
model_id = "microsoft/Phi-3-mini-4k-instruct"
new_model = "python-phi-3-mini-4k-instruct"
username = "ellipticaloranges"
device_map = {"": 0}
hf_model_repo = username + "/" + new_model
## ------------------------LoRA Configs------------------------------------------------------
lora_r = 16
lora_alpha = 16
lora_dropout = 0.05
target_modules = ['k_proj', 'q_proj', 'v_proj', 'o_proj', 'gate_proj', 'down_proj', 'up_proj']
## ------------------------------------------------------------------------------------------
dataset_name = "flytech/python-codes-25k"
dataset_split = "train"
dataset = load_dataset(dataset_name, split=dataset_split, cache_dir=cache_dir)
print(f"Dataset size: {len(dataset)}")
tokenizer = AutoTokenizer.from_pretrained(
    model_id, cache_dir=cache_dir, trust_remote_code=True, add_eos_token=True, use_fast=True
)
# Phi-3 has no dedicated padding token, so reuse the unknown token (and its ID) for padding.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
# Left padding avoids the Phi-3 Flash Attention error raised for batched generation
# with padding_side='right'.
tokenizer.padding_side = 'left'
def create_message_column(row):
    # Build a chat-format conversation: the instruction becomes the user turn,
    # the input + output become the assistant turn.
    messages = []
    user = {
        "content": f"{row['instruction']}",
        "role": "user"
    }
    messages.append(user)
    assistant = {
        "content": f"{row['input']}\n{row['output']}",
        "role": "assistant"
    }
    messages.append(assistant)
    return {"messages": messages}
def format_dataset_chatml(row):
    # Render the message list into a single training string using the model's chat template.
    return {"text": tokenizer.apply_chat_template(row["messages"], add_generation_prompt=False, tokenize=False)}
dataset_chatml = dataset.map(create_message_column)
dataset_chatml = dataset_chatml.map(format_dataset_chatml)
dataset_chatml = dataset_chatml.train_test_split(test_size=0.05, seed=1234)
# print("Max Seq Length", max(map(lambda x: len(tokenizer.encode(x["text"])), dataset)))
# Prefer bf16 with FlashAttention-2 when the GPU supports it (assumes the flash-attn
# package is installed); otherwise fall back to fp16 with PyTorch SDPA attention.
if torch.cuda.is_bf16_supported():
    compute_dtype = torch.bfloat16
    attn_implementation = 'flash_attention_2'
else:
    compute_dtype = torch.float16
    attn_implementation = 'sdpa'
print(f"Using {compute_dtype} with {attn_implementation} implementation")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    device_map=device_map,
    attn_implementation=attn_implementation,
    cache_dir=cache_dir,
)
args = SFTConfig(
    output_dir="./phi-3-mini-LoRA",
    eval_strategy="steps",
    do_eval=True,
    optim="adamw_torch",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,
    log_level="debug",
    save_strategy="epoch",
    logging_steps=10,
    learning_rate=1e-4,
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    eval_steps=100,
    dataset_text_field="text",
    max_seq_length=512,
    num_train_epochs=3,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    report_to="wandb",
    seed=42,
)
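# Effective train batch size per device: per_device_train_batch_size (8) x
# gradient_accumulation_steps (4) = 32 sequences per optimizer step.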
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    task_type=TaskType.CAUSAL_LM,
    target_modules=target_modules,
)
# The LoRA adapter is attached by SFTTrainer below via `peft_config`; calling
# model.add_adapter(peft_config) here as well would inject the adapter twice.
wandb.init(project="Phi 3", name="python-phi-3-lora")
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_chatml["train"],
    eval_dataset=dataset_chatml["test"],
    peft_config=peft_config,
    tokenizer=tokenizer,
    args=args,
)
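# Quick sanity check (a sketch added for illustration): report how many parameters the
# LoRA wrapping left trainable relative to the full model.
trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in trainer.model.parameters())
print(f"Trainable params: {trainable_params:,} / {total_params:,} ({100 * trainable_params / total_params:.2f}%)")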
trainer.train()
# Save the trained LoRA adapter weights to ./out/ after training.
trainer.save_model("./out/")
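
## ------------------------Optional inference check-------------------------------------------
# A minimal sketch, not part of the training run above. It assumes the adapter was written
# to "./out/" by trainer.save_model and reloads the base model from scratch (in practice
# this is best run in a fresh process to avoid holding two copies of the model in memory).
## ------------------------------------------------------------------------------------------
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    device_map=device_map,
    cache_dir=cache_dir,
)
tuned = PeftModel.from_pretrained(base, "./out/")
tuned.eval()

# Hypothetical example prompt, formatted with the same chat template used for training.
prompt = [{"role": "user", "content": "Write a Python function that reverses a string."}]
input_ids = tokenizer.apply_chat_template(
    prompt, add_generation_prompt=True, return_tensors="pt"
).to(base.device)
with torch.no_grad():
    output_ids = tuned.generate(input_ids, max_new_tokens=256)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))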