snaramirez872 committed
Commit d31451e · 1 Parent(s): c54c0b7

Upload 2 files

Files changed (2)
  1. app.py +139 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,139 @@
+ import torch
+ import torch.nn as TNN
+ import pandas as pd
+ from tqdm import tqdm
+ from torch.utils.data import Dataset, DataLoader as DL
+ from torch import cuda
+ import streamlit as st
+ from transformers import BertTokenizer as BT, BertModel as BM
+
+ device = 'cuda' if cuda.is_available() else 'cpu'
+
+ # Constants defined for later use
+ MAX_LEN = 128
+ TRAIN_BATCH_SIZE = 4
+ VALID_BATCH_SIZE = 4
+ LEARNING_RATE = 5e-05
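+ # 5e-5 sits in the fine-tuning range recommended by the original BERT paper;
+ # the small batch sizes keep GPU memory use modest.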
+
+ modName = 'bert-base-uncased'  # Pre-trained model
+ categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']  # Labels
+
+ data = pd.read_csv('./train.csv')
+ data.drop(['id'], inplace=True, axis=1)
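+ # Assumes the Jigsaw toxic-comment train.csv layout: an 'id' column, a
+ # 'comment_text' column, and one binary column per category listed above.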
+
+ new = pd.DataFrame()
+ new['text'] = data['comment_text']
+ new['labels'] = data.iloc[:, 1:].values.tolist()  # all six label columns, one multi-hot list per row
+
+ tokenizer = BT.from_pretrained(modName, do_lower_case=True)
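+ # do_lower_case matches the uncased checkpoint, whose vocabulary is
+ # lowercase-only; truncation is requested per-call in encode_plus below.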
+
+ class MultiLabelDataset(Dataset):
+     def __init__(self, df, tokenizer, max_len):
+         self.tokenizer = tokenizer
+         self.data = df
+         self.text = df.text
+         self.targets = self.data.labels
+         self.max_len = max_len
+
+     def __len__(self):
+         return len(self.targets)
+
+     def __getitem__(self, idx):
+         text = str(self.text[idx])
+         text = " ".join(text.split())  # collapse runs of whitespace
+
+         ins = self.tokenizer.encode_plus(
+             text,
+             None,
+             add_special_tokens=True,
+             max_length=self.max_len,
+             padding='max_length',  # pad_to_max_length is deprecated
+             truncation=True,
+             return_token_type_ids=True
+         )
+         input_ids = ins['input_ids']
+         attention_mask = ins['attention_mask']
+         token_type_ids = ins["token_type_ids"]
+
+         return {
+             'input_ids': torch.tensor(input_ids, dtype=torch.long),
+             'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
+             'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
+             'targets': torch.tensor(self.targets[idx], dtype=torch.float)
+         }
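+ # Each item pairs a fixed-length encoding of one comment with its
+ # six-dimensional multi-hot target vector.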
+
+ # Dataset and DataLoader
+ trainSize = 0.4  # fraction of rows used for training; the rest is held out
+ trainData = new.sample(frac=trainSize, random_state=200)
+ testData = new.drop(trainData.index).reset_index(drop=True)
+ trainData = trainData.reset_index(drop=True)
+
+ trainSet = MultiLabelDataset(trainData, tokenizer, MAX_LEN)
+ testSet = MultiLabelDataset(testData, tokenizer, MAX_LEN)
+
+ training_loader = DL(trainSet, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
+ testing_loader = DL(testSet, batch_size=VALID_BATCH_SIZE, shuffle=False)
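+ # shuffle=False keeps test batches aligned with testData's row order, which
+ # predict() below relies on when pairing texts with model outputs.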
+
+ # Model: BERT encoder with a small classification head
+ class BERTClass(TNN.Module):
+     def __init__(self):
+         super(BERTClass, self).__init__()
+         self.l1 = BM.from_pretrained(modName)
+         self.pre_classifier = TNN.Linear(768, 768)
+         self.dropout = TNN.Dropout(0.1)
+         self.classifier = TNN.Linear(768, len(categories))  # one logit per category
+
+     def forward(self, input_ids, attention_mask, token_type_ids):
+         out = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
+         hidden_state = out[0]
+         po = hidden_state[:, 0]  # [CLS] token representation
+         po = self.pre_classifier(po)
+         po = TNN.Tanh()(po)
+         po = self.dropout(po)
+         outs = self.classifier(po)
+         return outs
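+ # The Tanh pre-classifier over the [CLS] vector mirrors the pooler layer of
+ # the original BERT implementation, ahead of the final projection.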
+
+ mod = BERTClass()
+ mod.to(device)
+
+ # Loss function and Optimizer
+ def lossFN(outs, targets):
+     return TNN.BCEWithLogitsLoss()(outs, targets)
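+ # BCEWithLogitsLoss applies a sigmoid to each logit, scoring the six labels
+ # independently; targets already arrive as [batch, 6] multi-hot vectors, so
+ # no reshaping is needed.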
+
+ opt = torch.optim.Adam(mod.parameters(), lr=LEARNING_RATE)
+
+ # Training and Finetuning
+ def train(mod, training_loader):
+     mod.train()
+     for _, data in tqdm(enumerate(training_loader)):
+         input_ids = data['input_ids'].to(device, dtype=torch.long)
+         attention_mask = data['attention_mask'].to(device, dtype=torch.long)
+         token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
+         targets = data['targets'].to(device, dtype=torch.float)
+
+         outs = mod(input_ids, attention_mask, token_type_ids)
+
+         opt.zero_grad()
+         loss = lossFN(outs, targets)
+         loss.backward()
+         opt.step()
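+ # One call makes a single pass (epoch) over the training data; call it in a
+ # loop for more epochs.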
+
+ # Streamlit Table of Results
+ st.title("Finetuned Model for Toxicity")
+ st.subheader("Model: bert-base-uncased")
+
+ train(mod, training_loader)  # fine-tune the classification head before predicting
+
+ def predict(texts, loader):
+     mod.eval()
+     res = []
+     idx = 0
+     with torch.no_grad():
+         for ins in loader:
+             outs = mod(input_ids=ins['input_ids'].to(device),
+                        attention_mask=ins['attention_mask'].to(device),
+                        token_type_ids=ins['token_type_ids'].to(device))
+             probs = torch.sigmoid(outs)          # independent probability per label
+             preds = torch.argmax(probs, dim=-1)  # highest-scoring label per comment
+             for i in range(len(preds)):
+                 res.append({'TEXT': texts[idx],
+                             'LABEL': categories[preds[i].item()],
+                             'PROBABILITY': probs[i][preds[i]].item()})
+                 idx += 1
+     return res
+
+ res = predict(testData['text'].tolist(), testing_loader)
+ st.table(res)  # one row per test comment
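+
+ # To run locally (train.csv is expected next to app.py): streamlit run app.py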
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ transformers
+ torch
+ numpy
+ pandas
+ tqdm
+ streamlit