In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [None]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Fetch both API tokens from the Kaggle secrets client up front so that no
# credential is ever written into the notebook itself.
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")
wb_token = user_secrets.get_secret("wandb")

# Authenticate with the Hugging Face Hub (the last cell pushes to the Hub).
login(token=hf_token)

# Authenticate with Weights & Biases and open the run that the trainer
# reports to (the training arguments use report_to="wandb").
wandb.login(key=wb_token)
run = wandb.init(
    anonymous="allow",
    job_type="training",
    project="Fine-tune Llama 3 8B on Mathematical Word Problems",
)

In [3]:
# Inputs: the checkpoint path handed to from_pretrained and the dataset id
# handed to load_dataset.
base_model = "/kaggle/input/llama-3.1/transformers/8b-instruct/2"
dataset_name = "microsoft/orca-math-word-problems-200k"

# Output: used as the trainer's output_dir, the local save directory and the
# Hub repository name.
new_model = "llama-3.1-8b-chat-math-teacher"

In [4]:
# Attention backend requested when the model is loaded.
attn_implementation = "eager"

# Compute dtype used by the 4-bit quantization config.
torch_dtype = torch.float16

In [5]:
# QLoRA quantization settings: load the base weights in 4-bit NF4 with
# double quantization, and run computations in `torch_dtype`.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
)

In [None]:
# Load the base checkpoint with the 4-bit quantization config applied.
# The loading options are collected first so they can be read at a glance.
model_load_options = dict(
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation,
)
model = AutoModelForCausalLM.from_pretrained(base_model, **model_load_options)

In [8]:
# Load the tokenizer that ships with the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Let TRL install its chat format on both objects. The call returns a
# (model, tokenizer) pair, and both names are rebound to the returned objects.
# NOTE(review): this cell is presumably not safe to re-run on the already
# converted pair - restart the kernel instead; confirm against the TRL version.
model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)

In [9]:
# LoRA adapter settings: rank-16 adapters (alpha 32, dropout 0.05, no bias
# terms) attached to the projection layers named below.
lora_target_modules = [
    'up_proj', 'down_proj', 'gate_proj',
    'k_proj', 'q_proj', 'v_proj', 'o_proj',
]
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the quantized model so that it carries the LoRA adapters.
model = get_peft_model(model, peft_config)

In [None]:
# Load every split of the dataset, then keep a fixed random subset.
full_dataset = load_dataset(dataset_name, split="all")

# Seeded shuffle followed by the first 1000 rows: only 1000 samples are used
# to keep this a quick demo.
dataset = full_dataset.shuffle(seed=42).select(range(1000))

def format_chat_template(row):
    """Add a chat-formatted ``text`` field to a single dataset row.

    The row's ``question`` becomes the user turn and its ``answer`` the
    assistant turn. The two-turn conversation is rendered to a string with the
    module-level ``tokenizer``'s chat template (no tokenization).

    Args:
        row: Mapping with ``"question"`` and ``"answer"`` entries.

    Returns:
        The same ``row`` object, mutated to include the ``"text"`` entry.
    """
    conversation = [
        {"role": role, "content": row[column]}
        for role, column in (("user", "question"), ("assistant", "answer"))
    ]
    row["text"] = tokenizer.apply_chat_template(conversation, tokenize=False)
    return row

# Render every row into its chat-formatted "text" column, using 4 worker
# processes.
dataset = dataset.map(format_chat_template, num_proc=4)

# Show one formatted example as a sanity check.
dataset[3]["text"]

In [11]:
# Hold out 20% of the rows for evaluation. The split is seeded (same seed as
# the shuffle that selected the subset) so that a fresh "Restart & Run All"
# produces the same train/test partition and comparable eval losses.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [18]:
training_arguments = TrainingArguments(
    # Checkpoints and trainer state are written under the new model's name.
    output_dir=new_model,

    # Batching: one sample per device, gradients accumulated over 2 steps,
    # with similarly sized samples grouped together.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    group_by_length=True,

    # Optimisation: a single epoch with a short warmup.
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    warmup_steps=10,
    num_train_epochs=1,

    # Both mixed-precision flags are left off.
    fp16=False,
    bf16=False,

    # Evaluation and logging. eval_steps is a float below 1, i.e. a fraction
    # of the total number of training steps (every 80 of the 400 steps in the
    # recorded run).
    eval_strategy="steps",
    eval_steps=0.2,
    logging_strategy="steps",
    logging_steps=1,
    report_to="wandb",
)

In [None]:
# Build the supervised fine-tuning trainer over the chat-formatted "text"
# column.
#
# `model` has already been wrapped with get_peft_model(model, peft_config)
# earlier in the notebook, so `peft_config` is intentionally NOT passed here:
# handing SFTTrainer a PEFT model together with a peft_config is redundant
# and, as far as I recall, rejected by some newer TRL releases - confirm
# against the installed version.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

In [20]:
# Run the fine-tuning loop; whatever trainer.train() returns is kept in
# `history` for later inspection.
history = trainer.train()

Step,Training Loss,Validation Loss
80,0.4385,0.620739
160,0.3962,0.606882
240,0.6064,0.591038
320,0.5653,0.58397
400,0.944,0.57697




In [21]:
# Turn the model's cache flag on for the generation cell that follows.
model.config.use_cache = True

# Close the Weights & Biases run now that training has finished.
wandb.finish()

VBox(children=(Label(value='0.028 MB of 0.028 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▆▃▂▁
eval/runtime,▃▅▆▁█
eval/samples_per_second,▁▁▁▁▁
eval/steps_per_second,▁▁▁▁▁
train/epoch,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▇▇▇▇▇█████
train/global_step,▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▆▆▆▆▆▆▆▇▇▇▇▇█████
train/grad_norm,▄▃▅▃▃▁▂▃▂▂▃▁▂▃▂▃▅▇▂▂▂▄█▂▂▃▄▃▅▇▂▂▃▃▃▁▁▂▃▅
train/learning_rate,▂████▇▇▇▇▇▆▆▆▆▆▆▆▆▅▅▄▄▄▄▄▄▃▃▃▃▃▃▃▃▃▂▂▁▁▁
train/loss,▅▆▄▅▂▅▂▂▅▅▃▄▅▃▅▅▂▃▄▂▄▇▃▁▄▃▇▂▃▃▂▃▃▁▃▆▁▄▁█

0,1
eval/loss,0.57697
eval/runtime,195.8812
eval/samples_per_second,1.021
eval/steps_per_second,1.021
total_flos,1.1602099108503552e+16
train/epoch,1.0
train/global_step,400.0
train/grad_norm,1.6999
train/learning_rate,0.0
train/loss,0.944


In [29]:
# Single-turn conversation used as a smoke test for the fine-tuned model.
messages = [
    {
        "role": "user",
        "content": "This year, the age difference between Parisa and her mother is 40 years, and after 15 years, the age of her mother will be three times that of Parisa. Find the age of Parisa this year."
    }
]

# Render the conversation with the tokenizer's chat template and append the
# generation prompt so the model continues with the assistant turn.
prompt = tokenizer.apply_chat_template(messages, tokenize=False,
                                       add_generation_prompt=True)

# Put the inputs on the device the model actually lives on instead of
# hard-coding "cuda".
inputs = tokenizer(prompt, return_tensors='pt', padding=True,
                   truncation=True).to(model.device)

# Make sure dropout (including the LoRA dropout) is inactive while sampling
# an answer, regardless of the mode the trainer left the model in.
model.eval()

outputs = model.generate(**inputs, max_length=300,
                         num_return_sequences=1)

# The generated sequence starts with the prompt tokens; drop them by position
# so only the newly generated answer is decoded. Splitting the decoded string
# on the word "assistant" would return the wrong span whenever the question
# or the answer itself contains that word.
prompt_length = inputs["input_ids"].shape[1]
text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

print(text)


Let's denote Parisa's current age as P and her mother's current age as M.

According to the first condition, the age difference between Parisa and her mother is 40 years, so we can write:

M = P + 40  (1)

According to the second condition, after 15 years, the age of her mother will be three times that of Parisa. So we can write:

M + 15 = 3 * (P + 15)  (2)

Now, let's substitute the expression for M from equation (1) into equation (2):

(P + 40) + 15 = 3 * (P + 15)

Now, let's solve for P:

P + 40 + 15 = 3P + 45

Combine like terms:

P + 55 = 3P + 45

Subtract P from both sides:

55 = 2P + 45

Subtract 45 from both sides:

10 = 2P

Divide both sides by 2:

P = 5

So, Parisa is currently 5 years old.



In [None]:
# Save the trained model (the LoRA-wrapped model held by the trainer) locally.
trainer.model.save_pretrained(new_model)

# The tokenizer was modified by setup_chat_format earlier in the notebook, so
# it is stored next to the weights; otherwise anyone loading this model would
# have to recreate the same tokenizer by hand.
tokenizer.save_pretrained(new_model)

# Publish both to the Hub repository named after the new model, reusing the
# local directory rather than a temporary one.
trainer.model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)