from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training, TaskType
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
from trl import SFTTrainer, SFTConfig
from random import randrange
import torch
import wandb

cache_dir = "./../cache"
model_id = "microsoft/Phi-3-mini-4k-instruct"
new_model = "python-phi-3-mini-4k-instruct"
username = "ellipticaloranges"
device_map = {"": 0}
hf_model_repo = username + "/" + new_model

## ------------------------LoRA Configs------------------------------------------------------

lora_r = 16
lora_alpha = 16
lora_dropout = 0.05
target_modules = ["k_proj", "q_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"]

## ------------------------------------------------------------------------------------------

dataset_name = "flytech/python-codes-25k"
dataset_split = "train"

dataset = load_dataset(dataset_name, split=dataset_split, cache_dir=cache_dir)
print(f"Dataset size: {len(dataset)}")


tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    cache_dir=cache_dir,
    trust_remote_code=True,
    add_eos_token=True,
    use_fast=True,
)
# Pad with the unknown token rather than EOS, so EOS is not masked out of the
# training labels by the data collator.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
# Phi-3's Flash Attention implementation expects left padding for batched
# generation, so set it here once.
tokenizer.padding_side = 'left'


def create_message_column(row):
    messages = []
    user = {
        "content": f"{row['instruction']}",
        "role": "user"
    }
    messages.append(user)
    assistant = {
        "content": f"{row['input']}\n{row['output']}",
        "role": "assistant"
    }
    messages.append(assistant)
    return {"messages": messages}

def format_dataset_chatml(row):
    return {"text": tokenizer.apply_chat_template(row["messages"], add_generation_prompt=False, tokenize=False)}

dataset_chatml = dataset.map(create_message_column)
dataset_chatml = dataset_chatml.map(format_dataset_chatml)
dataset_chatml = dataset_chatml.train_test_split(test_size=0.05, seed=1234)

# print("Max Seq Length", max(map(lambda x: len(tokenizer.encode(x["text"])), dataset)))

if torch.cuda.is_bf16_supported():
    compute_dtype = torch.bfloat16
    attn_implementation = 'flash_attention_2'
else:
    compute_dtype = torch.float16
    attn_implementation = 'sdpa'
    
print(f"Using {compute_dtype} with {attn_implementation} implementation")

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    device_map=device_map,
    attn_implementation=attn_implementation,
    cache_dir=cache_dir,
)
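
# Optional alternative (a hedged QLoRA sketch, not used below): the model could
# instead be loaded in 4-bit with BitsAndBytesConfig and prepared with
# prepare_model_for_kbit_training, roughly like this:
#
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=compute_dtype,
#     bnb_4bit_use_double_quant=True,
# )
# model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     quantization_config=bnb_config,
#     trust_remote_code=True,
#     device_map=device_map,
#     attn_implementation=attn_implementation,
#     cache_dir=cache_dir,
# )
# model = prepare_model_for_kbit_training(model)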

args = SFTConfig(
    output_dir="./phi-3-mini-LoRA",
    eval_strategy="steps",
    do_eval=True,
    optim="adamw_torch",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,
    log_level="debug",
    save_strategy="epoch",
    logging_steps=10,
    learning_rate=1e-4,
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    eval_steps=100,
    dataset_text_field="text",
    max_seq_length=512,
    num_train_epochs=3,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    report_to="wandb",
    seed=42,
)
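# Effective batch size: per_device_train_batch_size (8) x gradient_accumulation_steps (4)
# = 32 examples per optimizer step on each device.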

peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    task_type=TaskType.CAUSAL_LM,
    target_modules=target_modules,
)

# The LoRA adapter is attached by SFTTrainer via `peft_config` below; calling
# model.add_adapter here as well would stack a second adapter on the model.

wandb.init(project="Phi 3", name="python-phi-3-lora")

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_chatml['train'],
    eval_dataset=dataset_chatml['test'],
    peft_config=peft_config,
    tokenizer=tokenizer,
    args=args,
)

trainer.train()

# Save the trained LoRA adapter locally after training.
trainer.save_model("./out/")
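
# Optional smoke test (a minimal sketch; the prompt below is only an example). A fresh
# tokenizer is used for generation because the training tokenizer was created with
# add_eos_token=True, which appends EOS to every encoded prompt.
gen_tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir, trust_remote_code=True)
pipe = pipeline("text-generation", model=model, tokenizer=gen_tokenizer)
prompt = gen_tokenizer.apply_chat_template(
    [{"role": "user", "content": "Write a Python function that reverses a string."}],
    tokenize=False,
    add_generation_prompt=True,
)
print(pipe(prompt, max_new_tokens=128, do_sample=False)[0]["generated_text"])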