Install required packages

In [82]:
from IPython.display import clear_output

!pip install -qqq --upgrade bitsandbytes transformers peft accelerate datasets trl flash_attn
!pip install huggingface_hub
!pip install python-dotenv
!pip install absl-py nltk rouge_score
!pip list | grep transformers.

clear_output()

Import packages

In [83]:
import numpy as np
import torch
import pandas as pd
from datasets import Dataset, load_metric
from peft import LoraConfig, prepare_model_for_kbit_training, PeftModel, get_peft_model
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, set_seed, pipeline)
from trl import SFTTrainer

Define parameters

In [84]:
# Name of the model to use as parent model
base_model_name = "microsoft/Phi-3-mini-4k-instruct"

# Name of the new model
model_name="acorreal/project-management-tutor"

# Set the model configuration
use_4bit = True
bnb_4bit_quant_type = "nf4"
use_double_quant = True
set_seed(1234)

Connect to Huggingface Hub

In [85]:
from huggingface_hub import login
from google.colab import userdata

login(token=userdata.get('HF_TOKEN'))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Load the dataset with the instruction set

In [86]:
df = pd.read_csv('dataset.csv')
df.columns = ['question', 'answer']
df['instruction'] = "You are an expert in project management, tasked with helping students master project management principles, including the Unified Process. Provide guidance on planning, executing, and closing projects, covering key concepts like scope, time, cost, quality, risk management, and stakeholder engagement. Use practical examples, case studies, and interactive discussions to enhance their knowledge and application of these skills in real-world scenarios.:\n\n"
df.head()

Unnamed: 0,question,answer,instruction
0,"What is the primary focus of the book ""CogiMes...",The book focuses on teaching AI model engineer...,"You are an expert in project management, taske..."
1,"Who is Aria Gray, and what is her role at Nebu...",Aria Gray is a fresh graduate in Computational...,"You are an expert in project management, taske..."
2,"What is MLflow, and why is it important in AI ...",MLflow is a platform that manages the machine ...,"You are an expert in project management, taske..."
3,"What is MLOps, and how does it relate to AI de...",MLOps is a practice for collaboration between ...,"You are an expert in project management, taske..."
4,What are the four main components of MLflow?,The four main components of MLflow are Trackin...,"You are an expert in project management, taske..."


In [87]:
# Load the dataset
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['question', 'answer', 'instruction'],
    num_rows: 1042
})

In [88]:
print(dataset[1])

{'question': 'Who is Aria Gray, and what is her role at NebulaTech?', 'answer': 'Aria Gray is a fresh graduate in Computational Linguistics starting her career at NebulaTech, where she explores various AI and ML concepts.', 'instruction': 'You are an expert in project management, tasked with helping students master project management principles, including the Unified Process. Provide guidance on planning, executing, and closing projects, covering key concepts like scope, time, cost, quality, risk management, and stakeholder engagement. Use practical examples, case studies, and interactive discussions to enhance their knowledge and application of these skills in real-world scenarios.:\n\n'}


Load the tokenizer to prepare the dataset

In [89]:
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.padding_side = 'right' # to prevent warnings

Function to generate the suitable format for our model.

In [90]:
def create_message_column(row):
    messages = []
    user = {
        "content": f"{row['instruction']}\n Input: {row['question']}",
        "role": "user"
    }
    messages.append(user)
    assistant = {
        "content": f"{row['answer']}",
        "role": "assistant"
    }
    messages.append(assistant)
    return {"messages": messages}

def format_dataset_chatml(row):
    return {"text": tokenizer.apply_chat_template(row["messages"], add_generation_prompt=False, tokenize=False)}

Implement the ChatML format on our dataset.

In [91]:
dataset_chatml = dataset.map(create_message_column)
dataset_chatml = dataset_chatml.map(format_dataset_chatml)

Map:   0%|          | 0/1042 [00:00<?, ? examples/s]

Map:   0%|          | 0/1042 [00:00<?, ? examples/s]

Print example

In [92]:
dataset_chatml[0]

{'question': 'What is the primary focus of the book "CogiMesh, Nexing, AdaptScenes, and the Unified Model Engineering Process (UMEP)"?',
 'answer': 'The book focuses on teaching AI model engineering using the Unified Model Engineering Process (UMEP), a methodology designed for AI applications.',
 'instruction': 'You are an expert in project management, tasked with helping students master project management principles, including the Unified Process. Provide guidance on planning, executing, and closing projects, covering key concepts like scope, time, cost, quality, risk management, and stakeholder engagement. Use practical examples, case studies, and interactive discussions to enhance their knowledge and application of these skills in real-world scenarios.:\n\n',
 'messages': [{'content': 'You are an expert in project management, tasked with helping students master project management principles, including the Unified Process. Provide guidance on planning, executing, and closing projects

Print dataset

In [93]:
dataset_chatml = dataset_chatml.train_test_split(test_size=0.05, seed=1234)
dataset_chatml

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'instruction', 'messages', 'text'],
        num_rows: 989
    })
    test: Dataset({
        features: ['question', 'answer', 'instruction', 'messages', 'text'],
        num_rows: 53
    })
})

Recognize GPU

In [94]:
compute_dtype = torch.bfloat16
attn_implementation = 'flash_attention_2'

Load the tokenizer and model to finetune

In [95]:
tokenizer = AutoTokenizer.from_pretrained(base_model_name,
                                          trust_remote_code=True,
                                          add_eos_token=True,
                                          use_fast=True)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'left'

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_double_quant,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    quantization_config=bnb_config,
    attn_implementation=attn_implementation
)

model = prepare_model_for_kbit_training(model)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Set up the QLoRA parameters.

In [96]:
print(model)

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3FlashAttention2(
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear4bit(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear4bit(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features=3

We now possess all the necessary components to construct our SFTTrainer and commence the model training.

In [97]:
from transformers import TrainingArguments

# Definir la configuraciÃ³n de LoRA
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=8,
    bias="none",
    target_modules=[
        "model.layers.0.self_attn.qkv_proj",
        "model.layers.0.self_attn.o_proj",
        "model.layers.0.mlp.gate_up_proj",
        "model.layers.0.mlp.down_proj",
        "model.layers.1.self_attn.qkv_proj",
        "model.layers.1.self_attn.o_proj",
        "model.layers.1.mlp.gate_up_proj",
        "model.layers.1.mlp.down_proj"
    ]
)

# ConfiguraciÃ³n de los argumentos de entrenamiento
args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=1,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    max_grad_norm=1.0,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    lr_scheduler_type="linear",
    report_to="none",
    no_cuda=True,
    fp16_full_eval=False,
    use_cpu=True
)

# Crear el objeto Trainer para el fine-tuning
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_chatml['train'],
    eval_dataset=dataset_chatml['test'],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=args,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/53 [00:00<?, ? examples/s]



In [98]:
# Get model modules
count = 0
for name, module in model.named_modules():
    count += 1
    if count < 10:
        print(name)
    else:
        break


model
model.embed_tokens
model.embed_dropout
model.layers
model.layers.0
model.layers.0.self_attn
model.layers.0.self_attn.o_proj
model.layers.0.self_attn.o_proj.base_layer


Initiate the model training process by invoking the train() method on our Trainer instance.

In [99]:
peft_model = get_peft_model(model, peft_config)
peft_model.config.use_cache = False
trainer.train()
trainer.save_model()



Epoch,Training Loss,Validation Loss
1,No log,No log


Login in to Hugging Face

In [100]:
#trainer.hub_model_id = "acorreal/project-management-adapter"
#hf_adapter_repo="acorreal/adapter-phi-3-mini-mental-health"
#trainer.push_to_hub(token='TOKEN')
trainer.push_to_hub()

adapter_model.safetensors:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/acorreal/results/commit/41ed99d19088771aeac48cf2a0abf10c17e2e26a', commit_message='End of training', commit_description='', oid='41ed99d19088771aeac48cf2a0abf10c17e2e26a', pr_url=None, pr_revision=None, pr_num=None)

### Download model from Huggin Face

In [101]:
%pip install --upgrade transformers huggingface_hub



In [102]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
from google.colab import userdata
login(token=userdata.get('HF_TOKEN'))

pm_adapter_name = "acorreal/phi3-project-management-adapter"
pm_model_name = "acorreal/phi3-project-management"

pm_model = AutoModelForCausalLM.from_pretrained(pm_model_name,
                                               trust_remote_code=True,
                                                device_map="auto")

pm_model = PeftModel.from_pretrained(model, pm_adapter_name)
pm_model = pm_model.merge_and_unload()

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


In [103]:
pm_tokenizer = AutoTokenizer.from_pretrained(pm_adapter_name)

In [104]:
pm_model.push_to_hub(pm_model_name)
pm_tokenizer.push_to_hub(pm_model_name)

CommitInfo(commit_url='https://huggingface.co/acorreal/phi3-project-management/commit/cc10becf9c9636142cc63902ac00787cf5cb131c', commit_message='Upload tokenizer', commit_description='', oid='cc10becf9c9636142cc63902ac00787cf5cb131c', pr_url=None, pr_revision=None, pr_num=None)

Evaluation

Retrieve the model and tokenizer from the Hub.

In [105]:
!pip install bitsandbytes



In [125]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
from google.colab import userdata

pm_model_name = "acorreal/phi3-project-management"

test_model = AutoModelForCausalLM.from_pretrained(pm_model_name)
test_tokenizer = AutoTokenizer.from_pretrained(pm_model_name, trust_remote_code=True)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
