snaramirez872 committed
Commit e263a89 · 1 Parent(s): 10935d0

Delete app.py

Files changed (1)
  1. app.py +0 -162
app.py DELETED
@@ -1,162 +0,0 @@
- import transformers as TRNSFM
- import torch
- import torch.nn as TNN
- import numpy as np
- import pandas as pd
- from tqdm import tqdm
- from sklearn import metrics
- from torch.utils.data import Dataset, DataLoader as DL  # keep Dataset under its own name; aliasing it as `set` shadowed the builtin used in ham()
- from torch import cuda
- import streamlit as st
- from transformers import BertTokenizer as BT, BertModel as BM
-
- # Defined variables for later use
- MAX_LEN = 128
- TRAIN_BATCH_SIZE = 4
- VALID_BATCH_SIZE = 4
- LEARNING_RATE = 5e-05
-
- modName = 'bert-base-uncased' # Pre-trained model
-
- categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'] # Labels
-
- device = 'cuda' if cuda.is_available() else 'cpu'
-
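- # ham(): mean Jaccard overlap between the true and predicted label sets of each sample
- # (a "Hamming score"-style accuracy for multi-label output); defined here but not called elsewhere in this file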
- def ham(y_true, y_pred, normalize=True, sample_weight=None):
-     accList = []
-     for i in range(y_true.shape[0]):
-         true = set(np.where(y_true[i])[0])
-         pred = set(np.where(y_pred[i])[0])
-         if len(true) == 0 and len(pred) == 0:
-             tempA = 1
-         else:
-             tempA = len(true.intersection(pred)) / \
-                     float(len(true.union(pred)))
-         accList.append(tempA)
-     return np.mean(accList)
-
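- # train.csv is expected to hold an 'id' column, the raw 'comment_text', and the six label columns listed in `categories`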
- data = pd.read_csv('./train.csv')
- data.drop(['id'], inplace=True, axis=1)
-
- new = pd.DataFrame()
- new['text'] = data['comment_text']
- new['labels'] = data.iloc[:, 1:].values.tolist()  # all six label columns per row, not just the first
-
- tokenizer = BT.from_pretrained(modName, truncation=True, do_lower_case=True)
-
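- # Torch Dataset: tokenizes each comment to MAX_LEN and returns the input tensors plus the 6-dim float target vector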
- class MultiLabelDataset(Dataset):
-     def __init__(self, df, tokenizer, max_len):
-         self.tokenizer = tokenizer
-         self.data = df
-         self.text = df.text
-         self.targets = self.data.labels
-         self.max_len = max_len
-
-     def __len__(self):
-         return len(self.targets)
-
-     def __getitem__(self, idx):
-         text = str(self.text[idx])
-         text = " ".join(text.split())
-
-         ins = self.tokenizer.encode_plus(
-             text,
-             None,
-             add_special_tokens=True,
-             max_length=self.max_len,
-             padding='max_length',
-             truncation=True,
-             return_token_type_ids=True
-         )
-         input_ids = ins['input_ids']
-         attention_mask = ins['attention_mask']
-         token_type_ids = ins["token_type_ids"]
-
-         #st.write("Input Keys: ", ins.keys()) # was used for debugging
-         return {
-             'input_ids': torch.tensor(input_ids, dtype=torch.long),
-             'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
-             'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
-             'targets': torch.tensor(self.targets[idx], dtype=torch.float)
-         }
-
- # Dataset and DataLoader
- trainSize = 0.4
- trainData = new.sample(frac=trainSize, random_state=200)
- testData = new.drop(trainData.index).reset_index(drop=True)
- trainData = trainData.reset_index(drop=True)
-
- trainSet = MultiLabelDataset(trainData, tokenizer, MAX_LEN)
- testSet = MultiLabelDataset(testData, tokenizer, MAX_LEN)
-
- training_loader = DL(trainSet, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
- testing_loader = DL(testSet, batch_size=VALID_BATCH_SIZE, shuffle=False)  # keep a fixed order so predictions can be matched back to their inputs
-
- # Collect per-example token-id lists from the test loader (used as the display text in the results table)
- test_loader_strings = []
- for dat in testing_loader:
-     test_loader_strings += dat['input_ids'].tolist()  # each batch is a dict of tensors; take one input-id list per example
-
- # Model
- class DistilBERTClass(TNN.Module):
-     def __init__(self):
-         super(DistilBERTClass, self).__init__()
-         self.l1 = BM.from_pretrained(modName)  # despite the class name, this wraps bert-base-uncased
-         self.pre_classifier = TNN.Linear(768, 768)
-         self.dropout = TNN.Dropout(0.1)
-         self.classifier = TNN.Linear(768, 6)
-
-     def forward(self, input_ids, attention_mask, token_type_ids):
-         out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
-         hidden_state = out[0]
-         po = hidden_state[:, 0]  # [CLS] token representation
-         po = self.pre_classifier(po)
-         po = TNN.Tanh()(po)
-         po = self.dropout(po)
-         outs = self.classifier(po)
-         return outs
-
- mod = DistilBERTClass()
- mod.to(device)
-
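- # The model returns raw logits; BCEWithLogitsLoss applies the sigmoid internally, one independent probability per label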
- # Loss function and Optimizer
- def lossFN(outs, targets):
-     return TNN.BCEWithLogitsLoss()(outs, targets)  # targets already match outs at shape (batch, 6)
-
- opt = torch.optim.Adam(mod.parameters(), lr=LEARNING_RATE)
-
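- # NOTE: train() is defined below but never called in this script, so predict() runs with a randomly initialized classifier head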
- # Training and Finetuning
- def train(mod, training_loader):
-     mod.train()
-     for _, batch in tqdm(enumerate(training_loader, 0)):
-         input_ids = batch['input_ids'].to(device, dtype=torch.long)
-         attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
-         token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
-         targets = batch['targets'].to(device, dtype=torch.float)
-
-         outs = mod(input_ids, attention_mask, token_type_ids)
-
-         opt.zero_grad()
-         loss = lossFN(outs, targets)
-         loss.backward()
-         opt.step()
-
- # Streamlit Table of Results
- st.title("Finetuned Model for Toxicity")
- st.subheader("Model: bert-base-uncased")
-
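- # Run the model over the held-out loader and tabulate the single highest-scoring category per example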
- def predict(tweets):
-     mod.eval()
-     res = []
-     idx = 0  # running index into `tweets`, which was collected from the same unshuffled testing_loader
-     with torch.no_grad():
-         for ins in testing_loader:
-             outs = mod(input_ids=ins['input_ids'].to(device), attention_mask=ins['attention_mask'].to(device), token_type_ids=ins['token_type_ids'].to(device))
-             probs = torch.softmax(outs, dim=-1)   # (batch, 6) scores over the categories
-             preds = torch.argmax(probs, dim=-1)   # keep only the top-scoring category per example
-             for i in range(len(preds)):
-                 res.append({'TWEETS': tweets[idx], 'LABEL': categories[preds[i].item()], 'PROBABILITY': probs[i][preds[i]].item()})
-                 idx += 1
-     return res
-
- res = predict(test_loader_strings)
- st.table(res) # table