from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training, TaskType
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    set_seed,
    pipeline,
)
from trl import SFTTrainer, SFTConfig
from random import randrange
import torch
import wandb

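# Paths and names: local cache dir, base Phi-3 checkpoint, adapter/repo name,
# and a device map that places the whole model on GPU 0.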
cache_dir = "./../cache"
model_id = "microsoft/Phi-3-mini-4k-instruct"
new_model = "python-phi-3-mini-4k-instruct"
username = "ellipticaloranges"
device_map = {"": 0}
hf_model_repo = username + "/" + new_model

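# LoRA hyperparameters: rank, scaling factor, dropout, and the attention/MLP
# projection layers of Phi-3 to adapt.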
lora_r = 16
lora_alpha = 16
lora_dropout = 0.05
target_modules = ["k_proj", "q_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"]

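# Load the "flytech/python-codes-25k" instruction dataset (train split) from the Hub.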
dataset_name = "flytech/python-codes-25k"
dataset_split = "train"

dataset = load_dataset(dataset_name, split=dataset_split, cache_dir=cache_dir)
print(f"Dataset size: {len(dataset)}")

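# Tokenizer setup: pad with the <unk> token (kept distinct from EOS) and pad on
# the left; add_eos_token=True appends EOS to each tokenized example.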
tokenizer = AutoTokenizer.from_pretrained(
    model_id, cache_dir=cache_dir, trust_remote_code=True, add_eos_token=True, use_fast=True
)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = "left"

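# Build chat-style messages from each row (instruction as the user turn,
# input + output as the assistant turn), then render them with the model's
# chat template into a single "text" field and split off 5% for evaluation.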
def create_message_column(row):
    messages = []
    user = {
        "content": f"{row['instruction']}",
        "role": "user",
    }
    messages.append(user)
    assistant = {
        "content": f"{row['input']}\n{row['output']}",
        "role": "assistant",
    }
    messages.append(assistant)
    return {"messages": messages}


def format_dataset_chatml(row):
    return {"text": tokenizer.apply_chat_template(row["messages"], add_generation_prompt=False, tokenize=False)}


dataset_chatml = dataset.map(create_message_column)
dataset_chatml = dataset_chatml.map(format_dataset_chatml)
dataset_chatml = dataset_chatml.train_test_split(test_size=0.05, seed=1234)

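# Choose compute dtype and attention backend: bfloat16 with FlashAttention-2
# where bf16 is supported (assumes the flash-attn package is installed),
# otherwise float16 with PyTorch SDPA.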
if torch.cuda.is_bf16_supported():
    compute_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    compute_dtype = torch.float16
    attn_implementation = "sdpa"

print(f"Using {compute_dtype} with {attn_implementation} implementation")

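# Load the base model in the selected dtype onto GPU 0.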
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    device_map=device_map,
    attn_implementation=attn_implementation,
    cache_dir=cache_dir,
)

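# SFT hyperparameters. Effective batch size is 8 x 4 (gradient accumulation) = 32
# per device; evaluation runs every 100 steps and checkpoints are saved per epoch.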
args = SFTConfig(
    output_dir="./phi-3-mini-LoRA",
    eval_strategy="steps",
    do_eval=True,
    optim="adamw_torch",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,
    log_level="debug",
    save_strategy="epoch",
    logging_steps=10,
    learning_rate=1e-4,
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    eval_steps=100,
    dataset_text_field="text",
    max_seq_length=512,
    num_train_epochs=3,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    report_to="wandb",
    seed=42,
)

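# LoRA configuration; the adapter is applied by SFTTrainer via peft_config below.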
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    task_type=TaskType.CAUSAL_LM,
    target_modules=target_modules,
)

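# Track the run with Weights & Biases.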
wandb.init(project="Phi 3", name="python-phi-3-lora")

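# Fine-tune on the formatted train split, evaluating on the held-out 5% test split.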
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_chatml["train"],
    eval_dataset=dataset_chatml["test"],
    peft_config=peft_config,
    tokenizer=tokenizer,
    args=args,
)

trainer.train()

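# Save the fine-tuned LoRA adapter to ./out/.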
trainer.save_model("./out/")