# Provenance (repository page header captured together with the file):
#   alphapen_trocr_large_70000 / finetuner_usloath.py
#   hadrakey's picture
#   Training in progress, step 1000
#   3f86748 verified
# Example inspired by https://huggingface.co/microsoft/Phi-3-vision-128k-instruct
# Import necessary libraries
from PIL import Image
import requests
from transformers import AutoModelForCausalLM
from transformers import AutoProcessor
from transformers import BitsAndBytesConfig
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, default_data_collator
import torch
import pandas as pd
from torchmetrics.text import CharErrorRate
from peft import LoraConfig, get_peft_model
from data import AlphaPenPhi3Dataset
from sklearn.model_selection import train_test_split
from datetime import datetime
import os
import evaluate
# tqdm.pandas()
# Send Weights & Biases logs for this run to the "Alphapen" project.
os.environ["WANDB_PROJECT"] = "Alphapen"

# Hub identifier of the checkpoint that gets fine-tuned.
model_id = "microsoft/Phi-3-vision-128k-instruct"

# Read the annotation table and discard every row with a missing value.
df_path = "/mnt/data1/Datasets/AlphaPen/" + "training_data.csv"
df = pd.read_csv(df_path).dropna()

# Hold out 15% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
train_df, test_df = train_test_split(df, test_size=0.15, random_state=0)

# Renumber both splits so their indices start from zero again.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Directory/prefix handed to the dataset as its image root.
root_dir = "/mnt/data1/Datasets/OCR/Alphapen/clean_data/final_cropped_rotated_"

# The processor is loaded with the checkpoint's remote code; its tokenizer is
# shared by both datasets.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
tokenizer = processor.tokenizer

# Settings common to the training and evaluation datasets.
dataset_kwargs = {
    "root_dir": root_dir,
    "tokenizer": tokenizer,
    "max_length": 128,
    "image_size": 128,
}
train_dataset = AlphaPenPhi3Dataset(dataframe=train_df, **dataset_kwargs)
# Only the first 10 held-out rows are used for evaluation.
eval_dataset = AlphaPenPhi3Dataset(dataframe=test_df.iloc[:10], **dataset_kwargs)

# Show one training example as a quick sanity check.
print(train_dataset[0])
# 4-bit NF4 quantization with double quantization enabled and bfloat16 as the
# compute dtype.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Load the checkpoint with the quantization config applied; device placement
# is delegated to the loader through device_map="auto".
model_load_options = {
    "quantization_config": nf4_config,
    "torch_dtype": "auto",
    "device_map": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_options)
# Special-token ids stored on the model config.
# A tokenizer that defines no CLS/SEP token reports None for cls_token_id and
# sep_token_id; copying those blindly would overwrite the configured
# start/end-of-sequence ids with None. Fall back to BOS/EOS in that case.
_start_token_id = processor.tokenizer.cls_token_id
if _start_token_id is None:
    _start_token_id = processor.tokenizer.bos_token_id

_end_token_id = processor.tokenizer.sep_token_id
if _end_token_id is None:
    _end_token_id = processor.tokenizer.eos_token_id

model.config.decoder_start_token_id = _start_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.eos_token_id = _end_token_id

# Beam-search / generation settings.
# NOTE(review): recent transformers releases read generation parameters from
# model.generation_config -- confirm these config attributes are actually
# picked up by generate() for this checkpoint.
model.config.max_new_tokens = 128
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4
# LoRA adapter settings.
# Adapters are attached only to modules named q_proj, k_proj, v_proj and
# o_proj; the alternatives tried in this script (gate_proj, up_proj,
# down_proj, or target_modules='all-linear') are left disabled.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
lora_config = LoraConfig(
    target_modules=lora_target_modules,
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
)
# print(model)
# import torch
# from transformers import Conv1D
# def get_specific_layer_names(model):
# # Create a list to store the layer names
# layer_names = []
# # Recursively visit all modules and submodules
# for name, module in model.named_modules():
# # Check if the module is an instance of the specified layers
# if isinstance(module, (torch.nn.Linear, torch.nn.Embedding, torch.nn.Conv2d, Conv1D)):
# # model name parsing
# layer_names.append('.'.join(name.split('.')[4:]).split('.')[0])
# return layer_names
# print(list(set(get_specific_layer_names(model))))
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# Wrap the quantized base model with the LoRA adapters described by
# lora_config.
model = get_peft_model(model, lora_config)

# The base model is loaded with device_map="auto" and 4-bit bitsandbytes
# quantization, so its weights are already placed by the loader. Calling
# .to(device) afterwards is redundant at best and would collapse a
# multi-device placement onto a single device, so the model is used as-is.
# `device` stays defined for any code that needs the preferred device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(model.vocab_size)
# run_name=f"Mistral-7B-SQL-QLoRA-{datetime.now().strftime('%Y-%m-%d-%H-%M-%s')}"
# # Step 3: Define the training arguments
# Training configuration: generation-based evaluation every 100 steps,
# checkpoints every 1000 steps, bf16 training/evaluation, W&B reporting and
# pushing checkpoints to the Hub repository named below.
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="steps",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    bf16=True,
    bf16_full_eval=True,
    output_dir="./",
    logging_steps=100,
    save_steps=1000,
    eval_steps=100,
    report_to="wandb",
    # "%S" (seconds) is intended here; the lowercase "%s" is a non-portable
    # platform extension (epoch seconds on glibc, unsupported elsewhere).
    run_name=f"phi3-vision-LoRA-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}",
    optim="adamw_torch_fused",
    lr_scheduler_type="cosine",
    gradient_accumulation_steps=2,
    learning_rate=1.0e-4,
    max_steps=10000,
    push_to_hub=True,
    hub_model_id="hadrakey/alphapen_phi3",
)
def compute_metrics(pred):
    """Compute the character error rate (CER) of the generated text.

    Args:
        pred: Evaluation output exposing ``predictions`` (generated token
            ids) and ``label_ids`` (reference token ids) as arrays.

    Returns:
        dict: ``{"cer": value}`` where ``value`` is the result of the
        ``cer`` metric on the decoded predictions and references.
    """
    cer_metric = evaluate.load("cer")
    pad_id = tokenizer.pad_token_id

    # Work on copies so the arrays held by `pred` are not modified in place.
    labels_ids = pred.label_ids.copy()
    pred_ids = pred.predictions.copy()
    print(labels_ids.shape, pred_ids.shape)

    # -100 is a padding/ignore marker, not a real token id; map it to the pad
    # token in both arrays before decoding.
    labels_ids[labels_ids == -100] = pad_id
    pred_ids[pred_ids == -100] = pad_id

    # Strip special tokens from predictions exactly as for the references;
    # otherwise pad/end markers in the predictions count as character errors.
    pred_str = processor.batch_decode(
        pred_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    print(pred_str)
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)
    print(label_str)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    return {"cer": cer}
# # Step 5: Define the Trainer
# Collect everything the trainer needs, then build it and start fine-tuning.
trainer_options = {
    "model": model,
    "args": training_args,
    "tokenizer": tokenizer,
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
    "data_collator": default_data_collator,
    "compute_metrics": compute_metrics,
}
trainer = Seq2SeqTrainer(**trainer_options)

# Run the training loop configured by training_args.
trainer.train()