# run as a module using: python3 -m scripts.finetune
# Using: https://huggingface.co/blog/mlabonne/sft-llama3

import torch
from trl import SFTTrainer
from datasets import load_dataset
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported

from data.fine_tune_dataset import load_data


def finetune(model="unsloth/Meta-Llama-3.1-8B-bnb-4bit", dataset="mlabonne/FineTome-100k"):
    hf_token = ""  # Hugging Face access token, needed for push_to_hub_merged below

    # Loading the model and restricting the context window
    max_seq_length = 2048
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model,
        max_seq_length=max_seq_length,
        load_in_4bit=True,
        dtype=None,  # None lets Unsloth auto-detect the best dtype for the GPU
    )

    # Loading the prepared dataset
    dataset = load_data(dataset, tokenizer)
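
    # Optional: inspect one formatted training example. This assumes load_data returns a
    # dataset with a "text" column, matching dataset_text_field="text" passed to SFTTrainer below.
    # print(dataset[0]["text"])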

    # Wrapping the model for fine-tuning - LoRA only trains roughly 42 million of the 8 billion parameters
    model = FastLanguageModel.get_peft_model(
        model,
        r=16,  # LoRA rank (low-rank adaptation freezes most of the model); higher ranks increase memory and compute cost
        lora_alpha=16,  # scaling factor for the LoRA updates
        lora_dropout=0,  # dropout disabled for a training speedup
        target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],  # projection layers LoRA targets
        use_rslora=True,  # rank-stabilised LoRA
        use_gradient_checkpointing="unsloth",
    )
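
    # Optional sanity check on how many parameters LoRA actually trains. A sketch assuming the
    # object returned by get_peft_model exposes the standard peft print_trainable_parameters() helper.
    # model.print_trainable_parameters()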

    # Saving the untrained model; save_method can be "lora" to only save adapters, or merged (16 or 4 bit)
    model.save_pretrained_merged("models/PreFineLlama-3.1-8B", tokenizer, save_method="merged_16bit")  # save locally to the models directory
    model.push_to_hub_merged("thebigoed/PreFineLlama-3.1-8B", tokenizer, token=hf_token, save_method="merged_16bit")

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        packing=True,  # pack short sequences together to speed up training
        args=TrainingArguments(
            learning_rate=3e-4,  # too low = slow training and local minima, too high = unstable training
            lr_scheduler_type="linear",  # adjusts the learning rate over training (linear and cosine are the most popular)
            per_device_train_batch_size=8,
            gradient_accumulation_steps=2,  # effective batch size = 8 * 2 = 16
            num_train_epochs=1,
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            logging_steps=1,
            optim="adamw_8bit",
            weight_decay=0.01,
            warmup_steps=10,
            output_dir="output",
            seed=0,
        ),
    )

    trainer.train()
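
    # Optional: report peak GPU memory after training. A minimal sketch using standard
    # torch.cuda APIs; assumes a single CUDA device at index 0.
    if torch.cuda.is_available():
        peak_gb = torch.cuda.max_memory_reserved(0) / 1024 ** 3
        total_gb = torch.cuda.get_device_properties(0).total_memory / 1024 ** 3
        print(f"Peak reserved GPU memory: {peak_gb:.2f} GB / {total_gb:.2f} GB")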

    # Saving the fine-tuned model; save_method can be "lora" to only save adapters, or merged (16 or 4 bit)
    model.save_pretrained_merged("models/FineLlama-3.1-8B", tokenizer, save_method="merged_16bit")  # save locally to the models directory
    model.push_to_hub_merged("thebigoed/FineLlama-3.1-8B", tokenizer, token=hf_token, save_method="merged_16bit")

    # Use to save in GGUF quantised format
    # quant_methods = ["q2_k", "q3_k_m", "q4_k_m", "q5_k_m", "q6_k", "q8_0"]
    # for quant in quant_methods:
    #     model.push_to_hub_gguf("", tokenizer, quant)

    return
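

# Quick inference sketch for the fine-tuned model, using the TextStreamer imported above.
# Hypothetical helper (not part of the original training flow): it assumes the tokenizer
# prepared by load_data/get_chat_template has a chat template accepting {"role", "content"} messages.
def generate_sample(model, tokenizer, prompt="Explain LoRA fine-tuning in one paragraph."):
    FastLanguageModel.for_inference(model)  # enable Unsloth's faster inference mode
    messages = [{"role": "user", "content": prompt}]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)
    streamer = TextStreamer(tokenizer)
    model.generate(input_ids=inputs, streamer=streamer, max_new_tokens=128, use_cache=True)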


if __name__ == "__main__":
    finetune()