# File size: 6,125 Bytes
# bdc29c8
from huggingface_hub import *
# create_repo(repo_id="test-model")
import pandas as pd
from datasets import load_dataset
# One seed for both the row shuffle and the train/test split, so the held-out
# rows (and therefore the reported metrics) are the same on every run.
SEED = 1

# Load the raw sentence pairs.
df_train = pd.read_csv("/home/prafull/apps_all/flan_tuning/FlanT5-train-test-idiomSimplifier.csv")
complex_sentences = df_train["Idiom sentences"].to_list()
simple_sentences = df_train["English casual"].to_list()
# Re-key the two columns as "dialogue" (model input) and "summary" (target),
# the names the tokenization and preprocessing code indexes by.
data_dict = {
    "dialogue": complex_sentences,
    "summary": simple_sentences,
}
df_train_new = pd.DataFrame(data_dict)
# random shuffling
df_train_shuffled = df_train_new.sample(frac=1, random_state=SEED)
# Save pre-processed final data; only the first 1000 shuffled rows are kept.
df_train_shuffled.head(1000).to_csv("dialog_summary.csv", encoding="utf-8", index=False)
dataset = load_dataset("csv", data_files="dialog_summary.csv", split="train")
# Hold out 5% of the rows for evaluation. The split is seeded; previously only
# the shuffle was, so the evaluation rows changed from run to run.
dataset = dataset.train_test_split(test_size=0.05, seed=SEED)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import concatenate_datasets

model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Instruction that preprocess_function prepends to every input sentence. It is
# repeated here so the source-length limit is measured on the text the model
# actually receives; keep the two strings identical.
INPUT_PREFIX = "Easy to understand Sentence without idioms and jargons: "

# Length limits are measured over train and test together.
_all_examples = concatenate_datasets([dataset["train"], dataset["test"]])

# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = _all_examples.map(
    lambda x: tokenizer(x["dialogue"], truncation=True),
    batched=True,
    remove_columns=["dialogue", "summary"],
)
# Same sentences with the instruction prefix attached. Measuring the limit on
# the bare sentences (as before) left no room for the prefix tokens, so the
# longest inputs lost their tail when preprocess_function truncated them.
tokenized_prompts = _all_examples.map(
    lambda x: tokenizer([INPUT_PREFIX + text for text in x["dialogue"]], truncation=True),
    batched=True,
    remove_columns=["dialogue", "summary"],
)
max_source_length = max(len(x) for x in tokenized_prompts["input_ids"])
print(f"Max source length: {max_source_length}")
# Target budget: longest un-prefixed source sentence plus 10 tokens of slack
# (unchanged from the previous behaviour; targets never carry the prefix).
max_target_length = max(len(x) for x in tokenized_inputs["input_ids"]) + 10
print(f"Max Target length: {max_target_length}")
def preprocess_function(
    sample,
    padding="max_length",
    prefix="Easy to understand Sentence without idioms and jargons: ",
):
    """Tokenize a batch of sentence pairs into model inputs and labels.

    Args:
        sample: Batch mapping with a "dialogue" list (source sentences) and a
            "summary" list (target sentences).
        padding: Padding strategy forwarded to the tokenizer for both inputs
            and targets. With "max_length", padded label positions are
            replaced by -100.
        prefix: Instruction text prepended to every source sentence.

    Returns:
        The tokenizer output for the prefixed inputs, with the target token
        ids stored under "labels".
    """
    # add prefix to the input for t5
    inputs = [prefix + item for item in sample["dialogue"]]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["summary"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels
    # by -100 when we want to ignore padding in the loss.
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        labels["input_ids"] = [
            [(token_id if token_id != pad_id else -100) for token_id in label]
            for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
# Run the preprocessing over every split in batches; the raw text columns are
# dropped so only the tokenizer outputs and labels remain.
tokenized_dataset = dataset.map(
    preprocess_function,
    remove_columns=["dialogue", "summary"],
    batched=True,
)
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")
from transformers import AutoModelForSeq2SeqLM
# huggingface hub model id
model_id = "google/flan-t5-base"
# load model from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize

# sent_tokenize needs NLTK's sentence-tokenizer data and raises LookupError
# when it is not installed. Probe it now so a missing resource is fetched
# before training starts, instead of failing at the first evaluation pass.
try:
    sent_tokenize("Probe sentence one. Probe sentence two.")
except LookupError:
    # Which resource name is looked up depends on the NLTK release, so both
    # are requested; a name the installed release does not use is harmless.
    nltk.download("punkt", quiet=True)
    nltk.download("punkt_tab", quiet=True)

# Metric
metric = evaluate.load("rouge")
# helper function to postprocess text
def postprocess_text(preds, labels):
    """Prepare decoded predictions and references for ROUGE scoring.

    Each text is stripped of surrounding whitespace and re-joined with one
    sentence per line.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple of two new lists of strings.
    """

    def _one_sentence_per_line(text):
        # rougeLSum expects newline after each sentence
        return "\n".join(sent_tokenize(text.strip()))

    preds = [_one_sentence_per_line(pred) for pred in preds]
    labels = [_one_sentence_per_line(label) for label in labels]
    return preds, labels
def compute_metrics(eval_preds):
    """Compute ROUGE scores and the mean generated length for an evaluation pass.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            be a tuple, in which case its first element is used.

    Returns:
        Dict of the ROUGE results scaled to percentages (rounded to 4 decimal
        places) plus "gen_len", the mean number of non-padding tokens per
        prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # Replace -100 as we can't decode it. Predictions get the same treatment
    # as labels: they can also carry -100 as padding after batches of different
    # lengths are gathered, which made batch_decode fail and was counted as
    # real tokens in gen_len.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result
from transformers import DataCollatorForSeq2Seq

# Padding value for label positions: we want to ignore tokenizer pad token in
# the loss, so labels are padded with -100.
label_pad_token_id = -100

# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=label_pad_token_id,
)
import torch

# Pin the process to the first GPU when one exists. The CUDA calls are guarded
# because they raise on hosts without a usable GPU, which previously aborted
# the script before training could start.
if torch.cuda.is_available():
    torch.cuda.set_device(0)
    device = torch.device("cuda", torch.cuda.current_device())
    print(torch.cuda.current_device())
else:
    device = torch.device("cpu")
    print("CUDA is not available; continuing on CPU.")
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Used both as the local output directory and as the hub model id.
repository_id = "flan-tuning"

# Define training args
training_args = Seq2SeqTrainingArguments(
    # output location
    output_dir=repository_id,
    overwrite_output_dir=True,
    # optimisation
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    num_train_epochs=1,
    fp16=False,  # Overflows with fp16
    predict_with_generate=True,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=500,
    # NOTE(review): newer transformers releases spell this `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=False,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)
# Create Trainer instance: model, arguments, collator, the tokenized train/test
# splits and the metric callback.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
# Run fine-tuning.
trainer.train()
# trainer.model.save_pretrained("/home/prafull/apps_all/ChatGPT_Playground/Flan_models/flan-t5-LARGE-IDIOM-24k", from_pt=True)
# tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
# PUSH TO HUB ------------
# Save our tokenizer and create model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Push the results to the hub
# (the stray trailing "|" that followed this call was a syntax error and has
# been removed)
trainer.push_to_hub()