# cs482-project / milestone_3.py
# (Hugging Face file-viewer header, kept as a comment so the file parses:
#  uploader cgr28, branch "milestone-3", commit cf5d81e, 2.97 kB)
from transformers import DistilBertTokenizerFast, DistilBertModel, AdamW
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
# assignment 3
# Milestone 3: multi-label toxic-comment classification with DistilBERT.
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

print("Reading data...")
raw = pd.read_csv("./data/train.csv")
# Build a two-column frame: the comment string plus a multi-hot label list
# taken from every column after the first two (id, comment_text).
toxic_data = pd.DataFrame(
    {
        "text": raw["comment_text"],
        "labels": raw.iloc[:, 2:].values.tolist(),
    }
)
print(toxic_data.head())
class ToxicDataset(Dataset):
    """Multi-label toxic-comment dataset.

    Wraps a dataframe with a ``text`` column (raw comment strings) and a
    ``labels`` column (per-row list of 0/1 floats, one entry per toxicity
    class) and yields padded, tokenized tensors for a DistilBERT-style model.

    Args:
        dataframe: source frame with ``text`` and ``labels`` columns,
            indexed 0..len-1 (callers reset_index before constructing).
        tokenizer: a HuggingFace fast tokenizer exposing ``encode_plus``.
        max_length: token budget per comment. Defaults to 12 to preserve
            the original script's setting; raise it for real training runs.
    """

    def __init__(self, dataframe, tokenizer, max_length=12):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.labels = self.data.labels
        self.max_length = max_length

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        text = str(self.text[idx])
        # Truncation happens at the *token* level via truncation=True below.
        # The original char-level slice (text[:12]) silently discarded text
        # and still allowed >max_length token sequences, which would produce
        # ragged batches and break the default collate function.
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            max_length=self.max_length,
            add_special_tokens=True,
            padding="max_length",  # pad_to_max_length= is deprecated
            truncation=True,
            return_token_type_ids=True,
        )
        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(
                inputs["token_type_ids"], dtype=torch.long
            ),
            # Float targets, as required by BCEWithLogitsLoss.
            "targets": torch.tensor(self.labels[idx], dtype=torch.float),
        }
print("Data read. Splitting data...")
train_data = toxic_data.sample(frac=.8)
test_data = toxic_data.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)
print("Data split. Tokenizing data...")
train_set = ToxicDataset(train_data, tokenizer)
test_set = ToxicDataset(test_data, tokenizer)
train_loader = DataLoader(train_set, batch_size=8, shuffle=True, num_workers=0)
test_loader = DataLoader(test_set, batch_size=8, shuffle=True, num_workers=0)
print("Data tokenized. Beginning training...")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = DistilBertModel.from_pretrained(model_name)
model.to(device)
model.train()
optim = AdamW(model.parameters(), lr=5e-5)
num_train_epochs = 2
for epoch in range(num_train_epochs):
for batch in train_loader:
optim.zero_grad()
input_ids = batch["ids"].to(device)
attention_mask = batch["mask"].to(device)
token_type_ids = batch["token_type_ids"].to(device, dtype = torch.long)
targets = batch["targets"].to(device)
outputs = model(input_ids, attention_mask, token_type_ids)
loss = torch.nn.BCEWithLogitsLoss()(outputs, targets)
loss.backward()
optim.step()
model.eval()
print("Training complete. Saving model...")
save_directory = ".results/model"
model.save_pretrained(save_directory)
print("Model saved.")