File size: 5,015 Bytes
617d510
d63d827
d1e02d2
617d510
 
408fdfa
617d510
c093cb0
 
dc98418
 
 
 
 
22c2fe3
dc98418
 
36ec160
 
2db5d22
 
36ec160
 
 
 
 
 
 
6a6f9ef
617d510
 
 
 
 
 
 
7ccc2c1
617d510
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36ec160
 
2db5d22
47c22ce
36ec160
73571fc
a01490b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73571fc
d97d18d
a01490b
 
408fdfa
c2dd68b
408fdfa
 
 
c2dd68b
408fdfa
 
a01490b
d97d18d
 
 
 
 
 
6407600
d97d18d
6a6f9ef
617d510
 
 
 
 
36ec160
617d510
 
 
 
a01490b
617d510
 
 
27817dc
b731238
617d510
0180558
b731238
0180558
 
 
 
08d468f
 
0180558
 
 
b731238
0180558
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, BertModel
from sklearn import metrics
import streamlit as st

# Define constants.
MAX_LENGTH = 100  # passed to the tokenizer as max_length (pad/truncate target)
INFER_BATCH_SIZE = 128  # batch size used by the inference DataLoader
HEAD_DROP_OUT = 0.4  # dropout probability of the classification heads

# Use CUDA if available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
bert_path = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_path)

# Read and format data (only the first 50 rows of each file are loaded).
tweets_raw = pd.read_csv("test.csv", nrows=50)
labels_raw = pd.read_csv("test_labels.csv", nrows=50)

label_set = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# One list of six label values per row, in label_set order.
label_vector = labels_raw[label_set].values.tolist()

# Take an explicit copy before adding a column: assigning into a selection of
# tweets_raw triggers pandas' SettingWithCopyWarning.
tweet_df = tweets_raw[["comment_text"]].copy()
tweet_df["labels"] = label_vector

# Dataset for loading tables into DataLoader
class ToxicityDataset(Dataset):
    """Map-style dataset that tokenizes comments from a DataFrame on access.

    The frame must provide a ``comment_text`` column (the text to tokenize)
    and a ``labels`` column (one sequence of numeric label values per row).
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """Store the table, the tokenizer and the pad/truncate length.

        Args:
            dataframe: Table with ``comment_text`` and ``labels`` columns.
            tokenizer: Object exposing ``encode_plus`` (a Hugging Face
                tokenizer in this script).
            max_len: Value forwarded to the tokenizer as ``max_length``.
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = self.data.comment_text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return its tensors.

        Args:
            index: Zero-based row position, as supplied by a DataLoader.

        Returns:
            Dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors)
            and ``targets`` (float tensor).
        """
        # Use positional access: a DataLoader hands out positions 0..len-1,
        # which only coincide with index labels for a default RangeIndex.
        # Label-based lookup breaks on filtered or re-indexed frames.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_token_type_ids=True,
        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "targets": torch.tensor(self.targets.iloc[index], dtype=torch.float),
        }

# Wrap the prepared table in a Dataset and a non-shuffling DataLoader so the
# predictions come back in the same order as the input rows.
infer_dataset = ToxicityDataset(
    dataframe=tweet_df,
    tokenizer=tokenizer,
    max_len=MAX_LENGTH,
)
infer_params = dict(batch_size=INFER_BATCH_SIZE, shuffle=False)
infer_loader = DataLoader(infer_dataset, **infer_params)

class BertClass(nn.Module):
    """BERT encoder followed by dropout and a 768 -> 6 linear classifier.

    The head reads the encoder's first output at sequence position 0.
    """

    def __init__(self):
        super().__init__()
        self.l1 = BertModel.from_pretrained(bert_path)
        self.dropout = nn.Dropout(HEAD_DROP_OUT)
        self.classifier = nn.Linear(768, 6)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return one row of six raw (pre-sigmoid) scores per input sequence."""
        # return_dict=False makes the encoder return a plain tuple
        # (required for Huggingface Transformers v4+ with this indexing).
        encoder_outputs = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        # Keep only position 0 of every sequence from the first output.
        first_token_state = encoder_outputs[0][:, 0]
        return self.classifier(self.dropout(first_token_state))

class PretrainedBertClass(nn.Module):
    """BERT encoder whose second output feeds dropout and a 768 -> 6 linear layer."""

    def __init__(self):
        super().__init__()
        self.l1 = BertModel.from_pretrained(bert_path)
        self.l2 = nn.Dropout(HEAD_DROP_OUT)
        self.l3 = nn.Linear(768, 6)

    def forward(self, ids, mask, token_type_ids):
        """Return one row of six raw (pre-sigmoid) scores per input sequence."""
        # With return_dict=False the encoder yields a 2-tuple; only the second
        # element is used by the head.
        _first_output, second_output = self.l1(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        dropped = self.l2(second_output)
        return self.l3(dropped)

# Let the user pick which model backs the front-end.
option = st.selectbox(
    "Select a text analysis model:",
    ("BERT", "Fine-tuned BERT"),
)
if option != "BERT":
    # NOTE(review): torch.load unpickles a complete model object here, so the
    # checkpoint file must come from a trusted source.
    model = torch.load("pytorch_bert_toxic.bin", map_location=torch.device("cpu"))
else:
    # Base encoder with a freshly initialised (not fine-tuned) head.
    model = PretrainedBertClass()

# Run the selected model over every batch with gradients disabled.
def inference():
    """Score every batch of ``infer_loader`` with the module-level ``model``.

    Returns:
        A ``(final_outputs, final_targets)`` tuple of nested Python lists with
        one row per example: ``final_outputs`` holds the sigmoid of the model
        outputs, ``final_targets`` the target values from the loader.
    """
    # The batches are moved to `device` below, so the model has to be on the
    # same device; otherwise the forward pass fails on a CUDA host.
    model.to(device)
    # Evaluation mode disables dropout.
    model.eval()
    final_targets = []
    final_outputs = []
    with torch.no_grad():
        for data in infer_loader:
            ids = data["ids"].to(device, dtype=torch.long)
            mask = data["mask"].to(device, dtype=torch.long)
            token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
            targets = data["targets"].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)
            final_targets.extend(targets.cpu().tolist())
            # Sigmoid turns each raw score into an independent probability.
            final_outputs.extend(torch.sigmoid(outputs).cpu().tolist())
    return final_outputs, final_targets

# Get predictions!
prediction, targets = inference()

# Format and present findings: for every comment keep only the label with the
# highest predicted probability, together with that probability.
best_indices = [int(np.argmax(example)) for example in prediction]
best_preds = [example[i] for example, i in zip(prediction, best_indices)]
best_labels = [label_set[i] for i in best_indices]

st.write("Toxicity Classification Result:")
# Take an explicit copy before adding columns: assigning into a selection of
# tweets_raw triggers pandas' SettingWithCopyWarning.
display_table = tweets_raw[["comment_text"]].copy()
display_table["Toxicity Classification"] = best_labels
display_table["Probability"] = best_preds
st.write(display_table)