import spaces
import gradio as gr
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, AutoConfig
from datasets import load_dataset, concatenate_datasets, load_from_disk, DatasetDict
import traceback
from sklearn.metrics import accuracy_score
import numpy as np
import torch
import os
import evaluate
from huggingface_hub import login
from peft import get_peft_model, LoraConfig

os.environ['HF_HOME'] = '/data/.huggingface'
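# Note: on a Hugging Face Space with persistent storage, /data is the persistent
# volume, so pointing HF_HOME there keeps model and dataset caches across restarts.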
'''
lora_config = LoraConfig(
    r=16,              # Rank of the low-rank adaptation
    lora_alpha=32,     # Scaling factor
    lora_dropout=0.1,  # Dropout for LoRA layers
    bias="none"        # Bias handling
)
model = AutoModelForSeq2SeqLM.from_pretrained('google/t5-efficient-tiny', num_labels=2, force_download=True)
model = get_peft_model(model, lora_config)
model.gradient_checkpointing_enable()
model_save_path = '/data/lora_finetuned_model'  # Specify your desired save path
model.save_pretrained(model_save_path)
'''
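# A fuller LoRA setup for a T5-style seq2seq model would typically also pin the
# task type and target modules; an inactive sketch of that variant is below
# (the target_modules names assume T5 attention projections), not code the app runs:
'''
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM",   # tells PEFT this wraps a seq2seq LM
    target_modules=["q", "v"],  # T5 query/value projection layers
)
'''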
def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad):
    try:
        torch.cuda.empty_cache()
        metric = evaluate.load("rouge", cache_dir='/data/cache')

        def compute_metrics(eval_preds):
            preds, labels = eval_preds
            if isinstance(preds, tuple):
                preds = preds[0]
            # The plain Trainer passes logits here rather than generated token ids,
            # so reduce to token ids before decoding.
            if preds.ndim == 3:
                preds = np.argmax(preds, axis=-1)

            preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
            result = {k: round(v * 100, 4) for k, v in result.items()}
            prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
            result["gen_len"] = np.mean(prediction_lens)
            # Exact-match accuracy over the decoded strings.
            accuracy = accuracy_score(decoded_labels, decoded_preds)
            result["eval_accuracy"] = round(accuracy * 100, 4)
            return result

        login(api_key.strip())
        training_args = TrainingArguments(
            remove_unused_columns=False,
            torch_empty_cache_steps=100,
            overwrite_output_dir=True,
            output_dir='/data/results',
            eval_strategy="steps",
            eval_steps=200,
            save_strategy='steps',
            save_steps=200,
            save_total_limit=3,
            learning_rate=lr * 0.00001,  # slider value is in units of 1e-5
            per_device_train_batch_size=int(batch_size),
            per_device_eval_batch_size=int(batch_size),
            gradient_accumulation_steps=int(grad),
            num_train_epochs=int(num_epochs),
            weight_decay=0.01,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",  # resolved to the "eval_accuracy" metric
            greater_is_better=True,
            logging_dir='/data/logs',
            logging_steps=200,
            hub_model_id=hub_id.strip(),
            fp16=True,
        )
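        # Example: with the Gradio defaults below (lr=1, batch_size=1, num_epochs=3,
        # grad=1), this resolves to learning_rate=1e-5, per-device batch size 1,
        # 3 epochs, and no gradient accumulation.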
        tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8')

        max_length = 512

        def tokenize_function(examples):
            model_inputs = tokenizer(
                examples['text'],
                max_length=max_length,
                truncation=True,
                padding='max_length',
                return_tensors='pt',
            )
            labels = tokenizer(
                examples['target'],
                max_length=128,
                truncation=True,
                padding='max_length',
                return_tensors='pt',
            )
            # Use the tokenized targets as the labels the model is trained against.
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        # Raw string columns; dropped after tokenization so batching only sees tensors.
        column_names = ['text', 'target']
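        # For reference, the shapes tokenize_function produces on a toy batch
        # (an inactive sketch; the example strings are made up):
        '''
        sample = tokenize_function({'text': ['summarize: a long input document'],
                                    'target': ['a short summary']})
        print(sample['input_ids'].shape)  # (1, 512) because of padding='max_length'
        print(sample['labels'].shape)     # (1, 128)
        '''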
        dataset = load_dataset(dataset_name.strip())

        # Sanity check: flag any validation rows whose text/target is not a string.
        for o, d in enumerate(dataset['validation']['text']):
            if not isinstance(d, str):
                print('hit')
                print(type(d))
                print(o)
        for o, d in enumerate(dataset['validation']['target']):
            if not isinstance(d, str):
                print('hit')
                print(type(d))
                print(o)
        # return 'done'  # debugging early exit; left commented so training actually runs

        dataset['train'] = dataset['train'].select(range(4000))
        dataset['validation'] = dataset['validation'].select(range(200))
        # Tokenize and drop the raw string columns.
        train_set = dataset.map(tokenize_function, batched=True, remove_columns=column_names)

        print('DONE')
        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_set['train'],
            eval_dataset=train_set['validation'],
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        trainer.train()

        trainer.push_to_hub(commit_message="Training complete!")
    except Exception as e:
        return f"An error occurred: {str(e)}, TB: {traceback.format_exc()}"
    return 'DONE!'
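# ROUGE is normally computed on generated text rather than on argmaxed logits.
# A variant using Seq2SeqTrainer with predict_with_generate would look roughly
# like the inactive sketch below (argument names follow the transformers API;
# the rest of the setup is assumed to match fine_tune_model above):
'''
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir='/data/results',
    eval_strategy="steps",
    predict_with_generate=True,   # evaluate on generated sequences
    generation_max_length=128,
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_set['train'],
    eval_dataset=train_set['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
'''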
'''
# Define Gradio interface
def predict(text):
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name.strip(), num_labels=2)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs)
    predictions = outputs.logits.argmax(dim=-1)
    return predictions.item()
'''
@spaces.GPU(duration=120)
def run_train(dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad):
    def initialize_weights(model):
        # Custom init for the named feed-forward projection weights; any other
        # parameter keeps the default random init from from_config.
        for name, param in model.named_parameters():
            if 'encoder.block.0.layer.0.DenseReluDense.wi.weight' in name:
                torch.nn.init.xavier_uniform_(param.data)
            elif 'encoder.block.0.layer.0.DenseReluDense.wo.weight' in name:
                torch.nn.init.kaiming_normal_(param.data)

    # Build the model from config only (random weights, no pretrained checkpoint).
    config = AutoConfig.from_pretrained("google/t5-efficient-tiny")
    model = AutoModelForSeq2SeqLM.from_config(config)
    initialize_weights(model)

    # Defined but not applied to the model here; training below updates the full model.
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.1,
        bias="none"
    )

    result = fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad)
    return result
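# If LoRA adapters are actually wanted in run_train, the lora_config above would be
# applied to the model before handing it to fine_tune_model; an inactive sketch:
'''
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
'''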
try:
    iface = gr.Interface(
        fn=run_train,
        inputs=[
            gr.Textbox(label="Dataset Name (e.g., 'imdb')"),
            gr.Textbox(label="Hub model ID to push to after training"),
            gr.Textbox(label="HF API token"),
            gr.Slider(minimum=1, maximum=10, value=3, label="Number of Epochs", step=1),
            gr.Slider(minimum=1, maximum=2000, value=1, label="Batch Size", step=1),
            gr.Slider(minimum=1, maximum=1000, value=1, label="Learning Rate (e-5)", step=1),
            gr.Slider(minimum=1, maximum=100, value=1, label="Gradient accumulation", step=1),
        ],
        outputs="text",
        title="Fine-Tune Hugging Face Model",
        description="This interface allows you to fine-tune a Hugging Face model on a specified dataset."
    )

    '''
    iface = gr.Interface(
        fn=predict,
        inputs=[
            gr.Textbox(label="Query"),
        ],
        outputs="text",
        title="Fine-Tune Hugging Face Model",
        description="This interface allows you to test a fine-tuned Hugging Face model."
    )
    '''

    iface.launch()
except Exception as e:
    print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")