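# Fine-tune a causal language model (Mistral-7B-v0.1 by default) on a Hugging Face Hub
# dataset using TRL's SFTTrainer with 4-bit weight loading and LoRA adapters.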
from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig
import os
from uuid import uuid4
import pandas as pd
import subprocess
from transformers import AutoModelForCausalLM, AutoTokenizer
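# Helper: return the longest tokenized length of the 'text' column.
# Relies on the global `tokenizer` created below, so it is only called after that assignment.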
def max_token_len(dataset):
    max_seq_length = 0
    for row in dataset:
        tokens = len(tokenizer(row['text'])['input_ids'])
        if tokens > max_seq_length:
            max_seq_length = tokens
    return max_seq_length
# model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1'
model_name = 'mistralai/Mistral-7B-v0.1'
# model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_max_length = tokenizer.model_max_length
print("Model Max Length:", model_max_length)
# dataset = load_dataset("imdb", split="train")
dataset_name = 'ai-aerospace/ams_data_train_generic_v0.1_100'
dataset = load_dataset(dataset_name)  # load all splits; 'train' and 'validation' are both used below
# Write dataset files into data directory
data_directory = './fine_tune_data/'
# Create the data directory if it doesn't exist
os.makedirs(data_directory, exist_ok=True)
# Write the train data to a CSV file
train_data='train_data'
train_filename = os.path.join(data_directory, train_data)
dataset['train'].to_pandas().to_csv(train_filename+'.csv', columns=['text'], index=False)
max_token_length_train=max_token_len(dataset['train'])
print('Max token length train: '+str(max_token_length_train))
# Write the validation data to a CSV file
validation_data='validation_data'
validation_filename = os.path.join(data_directory, validation_data)
dataset['validation'].to_pandas().to_csv(validation_filename+'.csv', columns=['text'], index=False)
max_token_length_validation=max_token_len(dataset['validation'])
print('Max token length validation: '+str(max_token_length_validation))
max_token_length=max(max_token_length_train,max_token_length_validation)
if max_token_length > model_max_length:
    raise ValueError(
        f"Maximum token length ({max_token_length}) exceeds the model limit ({model_max_length})."
    )
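# Size the block to twice the longest example seen in either split.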
block_size=2*max_token_length
print('Block size: '+str(block_size))
# Define project parameters
username='ai-aerospace'
project_name='./llms/'+'ams_data_train-100_'+str(uuid4())
repo_name='ams-data-train-100-'+str(uuid4())
model_params = {
    "project_name": project_name,
    "model_name": model_name,
    "repo_id": username + '/' + repo_name,
    "train_data": train_data,
    "validation_data": validation_data,
    "data_directory": data_directory,
    "block_size": block_size,
    "model_max_length": max_token_length,
    "logging_steps": -1,
    "evaluation_strategy": "epoch",
    "save_total_limit": 1,
    "save_strategy": "epoch",
    "mixed_precision": "fp16",
    "lr": 0.00003,
    "epochs": 3,
    "batch_size": 2,
    "warmup_ratio": 0.1,
    "gradient_accumulation": 1,
    "optimizer": "adamw_torch",
    "scheduler": "linear",
    "weight_decay": 0,
    "max_grad_norm": 1,
    "seed": 42,
    "quantization": "int4",
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05
}
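# Mirror every hyperparameter into the process environment as a string,
# e.g. for a tool launched later in a subprocess.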
for key, value in model_params.items():
    os.environ[key] = str(value)
print(model_params)
### Load model
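# Load the base model with its weights quantized to 4-bit (requires the bitsandbytes
# package); this matches the "int4" quantization entry in model_params.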
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True
)
### Start trainer
# trainer = SFTTrainer(
# model_name,
# train_dataset=dataset,
# dataset_text_field="text",
# max_seq_length=512,
# )
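# LoRA adapter settings from model_params; target modules are left to peft's
# per-architecture defaults.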
peft_config = LoraConfig(
    r=model_params['lora_r'],
    lora_alpha=model_params['lora_alpha'],
    lora_dropout=model_params['lora_dropout']
)
trainer = SFTTrainer(
    model,
    train_dataset=dataset['train'],
    dataset_text_field="text",
    peft_config=peft_config,
    max_seq_length=model_params['model_max_length']  # data-driven max token length computed above
)
trainer.train()
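# Optional follow-up (not in the original script, sketch only): persist the trained
# LoRA adapter locally and, if you are authenticated with the Hub, push it to the
# repo id assembled above. Uncomment and adapt as needed.
# trainer.save_model(model_params['project_name'])
# trainer.model.push_to_hub(model_params['repo_id'])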