import torch
import torch.nn as nn
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch import cuda
import streamlit as st
from transformers import BertTokenizer, BertModel
device = 'cuda' if cuda.is_available() else 'cpu'
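# Fine-tuning BERT on CPU is very slow, so prefer the GPU whenever one is available.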
# Hyperparameters and constants
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
LEARNING_RATE = 5e-05
modName = 'bert-base-uncased' # Pre-trained model
categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'] # Labels
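# train.csv is assumed to follow the Jigsaw Toxic Comment Classification format:
# an 'id' column, a 'comment_text' column, and one 0/1 column per category above.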
data = pd.read_csv('./train.csv')
data.drop(['id'], inplace=True, axis=1)
new = pd.DataFrame()
new['text'] = data['comment_text']
new['labels'] = data[categories].values.tolist()  # 6-dim multi-hot vector per comment
tokenizer = BertTokenizer.from_pretrained(modName, do_lower_case=True)
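# Wraps the DataFrame so each item is tokenized on the fly into fixed-length
# BERT inputs plus its six-dimensional multi-hot target vector.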
class MultiLabelDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = df
        self.text = df.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        text = str(self.text[idx])
        text = " ".join(text.split())  # collapse runs of whitespace
        ins = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',  # pad_to_max_length is deprecated
            truncation=True,
            return_token_type_ids=True
        )
        input_ids = ins['input_ids']
        attention_mask = ins['attention_mask']
        token_type_ids = ins["token_type_ids"]
        #st.write("Input Keys: ", ins.keys()) # was used for debugging
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[idx], dtype=torch.float)
        }
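# Hold out 20% of the comments for evaluation; random_state pins the split for reproducibility.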
trainSize = 0.8
trainData = new.sample(frac=trainSize, random_state=200)
testData = new.drop(trainData.index).reset_index(drop=True)
trainData = trainData.reset_index(drop=True)
trainSet = MultiLabelDataset(trainData, tokenizer, MAX_LEN)
testSet = MultiLabelDataset(testData, tokenizer, MAX_LEN)
training_loader = DataLoader(trainSet, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
testing_loader = DataLoader(testSet, batch_size=VALID_BATCH_SIZE, shuffle=False)  # no need to shuffle for evaluation
# Classification head on top of BERT: pool the [CLS] token, then
# Linear -> Tanh -> Dropout -> Linear down to one logit per category.
class BERTClass(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = BertModel.from_pretrained(modName)
        self.pre_classifier = nn.Linear(768, 768)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, len(categories))

    def forward(self, input_ids, attention_mask, token_type_ids):
        out = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = out[0]       # last hidden states: (batch, seq_len, 768)
        po = hidden_state[:, 0]     # representation of the [CLS] token
        po = self.pre_classifier(po)
        po = torch.tanh(po)
        po = self.dropout(po)
        outs = self.classifier(po)  # raw logits, one per category
        return outs
mod = BERTClass()
mod.to(device)
# Loss function and optimizer. BCEWithLogitsLoss treats each of the six outputs
# as an independent binary decision, as multi-label classification requires;
# targets already have shape (batch, 6), matching the logits.
def lossFN(outs, targets):
    return nn.BCEWithLogitsLoss()(outs, targets)
opt = torch.optim.Adam(mod.parameters(), lr=LEARNING_RATE)
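# 5e-5 sits in the 2e-5..5e-5 range the BERT authors recommend for fine-tuning.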
# Training and Finetuning
def train(mod, training_loader):
    mod.train()
    for batch in tqdm(training_loader):  # 'batch' avoids shadowing the global 'data'
        input_ids = batch['input_ids'].to(device, dtype=torch.long)
        attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)
        outs = mod(input_ids, attention_mask, token_type_ids)
        opt.zero_grad()
        loss = lossFN(outs, targets)
        loss.backward()
        opt.step()
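# The script defined train() but never invoked it; assuming a single epoch of
# fine-tuning is intended before serving predictions.
EPOCHS = 1
for epoch in range(EPOCHS):
    train(mod, training_loader)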
# Streamlit table of results
st.title("Finetuned Model for Toxicity")
st.subheader("Model: bert-base-uncased")
def predict(loader):
    mod.eval()
    res = []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            outs = mod(input_ids=input_ids,
                       attention_mask=batch['attention_mask'].to(device),
                       token_type_ids=batch['token_type_ids'].to(device))
            # Multi-label head: sigmoid gives an independent probability per
            # category (softmax would wrongly force the categories to compete)
            probs = torch.sigmoid(outs)
            preds = torch.argmax(probs, dim=-1)  # most probable category per comment
            for i in range(input_ids.size(0)):
                text = tokenizer.decode(input_ids[i], skip_special_tokens=True)
                res.append({'TWEETS': text,
                            'LABEL': categories[preds[i].item()],
                            'PROBABILITY': probs[i, preds[i]].item()})
    return res
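# Note: argmax reports only the single most probable category. For true
# multi-label output one could instead threshold the sigmoid probabilities,
# e.g. flags = probs > 0.5, and report every category that fires.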
res = predict(testing_loader)
st.table(res)  # one row per comment: decoded text, top label, probability