# Captured from a Spaces file viewer; Space status at capture time: "Runtime error".
# File size: 3,087 bytes. Commit hashes listed alongside the file: 01769d2, cf5d81e.
# (The viewer's line-number gutter, 1-107, is page chrome and not part of the script.)
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
# from torch.optim import AdamW
import pandas as pd
from sklearn.model_selection import train_test_split
# assignment 3
# Name of the pretrained checkpoint; passed to from_pretrained() for both the
# tokenizer and the sequence-classification model further down.
model_name = "distilbert-base-uncased"
class ToxicDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with multi-label targets.

    Args:
        encodings: Mapping from field name (e.g. ``"input_ids"``) to a
            per-sample indexable container -- either a tensor whose first
            dimension is the sample index, or a list of per-sample lists.
        labels: Per-sample label vectors, indexable in the same order as the
            encodings.

    Each item is a dict holding one tensor per encoding field plus a
    ``"labels"`` tensor of dtype ``torch.float``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_sample_tensor(value):
        """Return an independent tensor for one sample's field value."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a copy-construct warning;
            # clone().detach() is the recommended equivalent.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of tensors."""
        item = {
            key: self._as_sample_tensor(val[idx])
            for key, val in self.encodings.items()
        }
        # Multi-label targets must be floating point: the sequence
        # classification head selects BCE-with-logits for float labels,
        # whereas integer labels select single-label cross-entropy, which
        # cannot consume one label vector per sample.
        item["labels"] = torch.as_tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        """Return the number of samples (one per label vector)."""
        return len(self.labels)
# --- Load the raw CSV and reduce it to a text column plus a label-vector column ---
print("Reading data...")
data = pd.read_csv("./data/train.csv")
toxic_data = pd.DataFrame()
toxic_data["text"] = data["comment_text"]
# Every column from the third onward is taken as a label; each row becomes one
# list of label values.
toxic_data["labels"] = data.iloc[:, 2:].values.tolist()

# --- Hold out 20% of the rows for validation ---
print("Data read. Splitting data...")
train_texts, val_texts, train_labels, val_labels = train_test_split(
    toxic_data["text"].to_list(),
    toxic_data["labels"].to_list(),
    test_size=0.2,
    random_state=42,  # fixed seed so the train/validation split is reproducible
)

# --- Tokenize both splits and wrap them as datasets ---
print("Data split. Tokenizing data...")
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
# Calling the tokenizer directly is the documented batch-encoding entry point
# (batch_encode_plus is the legacy spelling). padding=True pads each split to
# its longest sequence; truncation=True caps sequences at the model maximum.
train_encodings = tokenizer(
    train_texts, truncation=True, padding=True, return_tensors="pt"
)
val_encodings = tokenizer(
    val_texts, truncation=True, padding=True, return_tensors="pt"
)
train_dataset = ToxicDataset(train_encodings, train_labels)
val_dataset = ToxicDataset(val_encodings, val_labels)
print("Data tokenized. Beginning training...")
# Hyper-parameters and output locations for the Trainer run.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

# No manual device selection is needed: Trainer places the model and batches
# on the available device itself.
# problem_type is set explicitly so the classification head uses a per-label
# (multi-label) loss for the 6 label columns instead of inferring the loss
# from the label dtype at the first forward pass. Targets must be float.
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=6,
    problem_type="multi_label_classification",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()
# Earlier hand-written training loop, left commented out for reference
# (trainer.train() above is what actually runs):
# model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=6)
# model.to(device)
# model.train()
# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# optim = AdamW(model.parameters(), lr=5e-5)
# num_train_epochs = 2
# for epoch in range(num_train_epochs):
#     for batch in train_loader:
#         optim.zero_grad()
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["labels"].to(device)
#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs[0]
#         loss.backward()
#         optim.step()
# model.eval()
# --- Persist the fine-tuned model ---
print("Training complete. Saving model...")
# NOTE(review): ".results/model" resolves to a hidden ".results" directory,
# while TrainingArguments writes to "./results". This looks like a typo for
# "./results/model"; left unchanged here in case a loader elsewhere already
# depends on this exact path -- confirm and align.
save_directory = ".results/model"
model.save_pretrained(save_directory)
# Save the tokenizer next to the weights so the directory can be reloaded
# with from_pretrained() without referring back to the base checkpoint.
tokenizer.save_pretrained(save_directory)
print("Model saved.")