# Spaces:
# Runtime error
# Runtime error
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AdamW,
    DistilBertForSequenceClassification,
    DistilBertModel,
    DistilBertTokenizerFast,
)
# assignment 3
# Checkpoint name used for the tokenizer here; kept at module level for reuse.
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

print("Reading data...")
data = pd.read_csv("./data/train.csv")

# Build a two-column frame: the raw comment text, plus one list per row that
# collects every column from the third onward (the label columns).
toxic_data = pd.DataFrame(
    {
        "text": data["comment_text"],
        "labels": data.iloc[:, 2:].values.tolist(),
    }
)
print(toxic_data.head())
class ToxicDataset(Dataset):
    """Map-style dataset pairing each comment with its multi-label target.

    Args:
        dataframe: Frame with a ``text`` column (the comment) and a ``labels``
            column (one list of numeric labels per row).
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and ``return_token_type_ids`` keyword arguments and
            returning a mapping with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` (e.g. a Hugging Face tokenizer).
        max_length: Fixed number of tokens every sample is padded or
            truncated to. Defaults to 12, the value previously hard-coded.
    """

    def __init__(self, dataframe, tokenizer, max_length=12):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.labels = self.data.labels
        self.max_length = max_length

    def __len__(self):
        """Return the number of comments in the dataset."""
        return len(self.text)

    def __getitem__(self, idx):
        """Tokenize the comment at position ``idx`` and return model tensors.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors
            of length ``max_length``) and ``targets`` (float tensor holding
            the row's labels).
        """
        # Positional lookup, so the dataset also works when the frame's index
        # has not been reset to 0..n-1.
        text = str(self.text.iloc[idx])

        # Let the tokenizer truncate by *tokens*. The previous code cut the
        # string to 12 characters first, which left only a handful of tokens
        # and wasted most of the window.
        inputs = self.tokenizer(
            text,
            max_length=self.max_length,
            add_special_tokens=True,
            padding="max_length",  # replaces deprecated pad_to_max_length=True
            truncation=True,
            return_token_type_ids=True,
        )

        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(
                inputs["token_type_ids"], dtype=torch.long
            ),
            "targets": torch.tensor(self.labels.iloc[idx], dtype=torch.float),
        }
print("Data read. Splitting data...")
# Random 80/20 split. Both halves are re-indexed from 0 so positions and
# index labels agree.
train_data = toxic_data.sample(frac=.8)
test_data = toxic_data.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

print("Data split. Tokenizing data...")
train_set = ToxicDataset(train_data, tokenizer)
test_set = ToxicDataset(test_data, tokenizer)
train_loader = DataLoader(train_set, batch_size=8, shuffle=True, num_workers=0)
test_loader = DataLoader(test_set, batch_size=8, shuffle=True, num_workers=0)

print("Data tokenized. Beginning training...")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One output logit per label column; taken from the data rather than hard-coded.
num_labels = len(toxic_data["labels"].iloc[0])

# The bare DistilBertModel has no classification head and returns a
# ModelOutput object, which cannot be fed to a loss function. The
# sequence-classification variant adds the head and exposes `.logits`
# with shape (batch, num_labels).
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    problem_type="multi_label_classification",
)
model.to(device)
model.train()

# transformers.AdamW is deprecated; torch.optim.AdamW is its replacement.
# weight_decay=0.0 keeps the "no decay" behaviour of the old default.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)
# Built once instead of on every step. Labels are independent, hence
# per-label sigmoid + binary cross-entropy.
loss_fn = torch.nn.BCEWithLogitsLoss()

num_train_epochs = 2
for epoch in range(num_train_epochs):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch["ids"].to(device)
        attention_mask = batch["mask"].to(device)
        targets = batch["targets"].to(device)
        # DistilBERT takes no token_type_ids. Passing them positionally put
        # them in the `head_mask` slot, so call by keyword and leave them out.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, targets)
        loss.backward()
        optim.step()
model.eval()

print("Training complete. Saving model...")
# NOTE(review): ".results/model" is a hidden directory relative to the CWD;
# "./results/model" may have been intended -- confirm before changing, since
# other code may load from this path.
save_directory = ".results/model"
model.save_pretrained(save_directory)
print("Model saved.")