import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, BertModel
from sklearn import metrics
import streamlit as st
# Define constants. Enable CUDA if available.
MAX_LENGTH = 100
INFER_BATCH_SIZE = 128
HEAD_DROP_OUT = 0.4
device = "cuda" if torch.cuda.is_available() else "cpu"
bert_path = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_path)
# Read and format data.
tweets_raw = pd.read_csv("test.csv", nrows=50)
labels_raw = pd.read_csv("test_labels.csv", nrows=50)
label_set = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
label_vector = labels_raw[label_set].values.tolist()
tweet_df = tweets_raw[["comment_text"]].copy()
tweet_df["labels"] = label_vector
# Dataset for loading tables into DataLoader
class ToxicityDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = self.data.comment_text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_token_type_ids=True,
        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "targets": torch.tensor(self.targets[index], dtype=torch.float),
        }
# Prepare the Dataset and DataLoader for inference.
infer_dataset = ToxicityDataset(tweet_df, tokenizer, MAX_LENGTH)
infer_params = {"batch_size": INFER_BATCH_SIZE, "shuffle": False}
infer_loader = DataLoader(infer_dataset, **infer_params)
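# Optional sanity check (a sketch, not part of the original app): peek at the
# first batch to confirm the tensors have the expected shapes before running
# the full model.
# first_batch = next(iter(infer_loader))
# assert first_batch["ids"].shape[-1] == MAX_LENGTH
# assert first_batch["targets"].shape[-1] == len(label_set)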
class BertClass(torch.nn.Module):
    def __init__(self):
        super(BertClass, self).__init__()
        self.l1 = BertModel.from_pretrained(bert_path)
        self.dropout = torch.nn.Dropout(HEAD_DROP_OUT)
        self.classifier = torch.nn.Linear(768, 6)

    # return_dict must be False for Hugging Face Transformers v4+ so the BERT
    # outputs unpack as a tuple.
    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output
class PretrainedBertClass(torch.nn.Module):
    def __init__(self):
        super(PretrainedBertClass, self).__init__()
        self.l1 = BertModel.from_pretrained(bert_path)
        self.l2 = torch.nn.Dropout(HEAD_DROP_OUT)
        self.l3 = torch.nn.Linear(768, 6)

    def forward(self, ids, mask, token_type_ids):
        _, output_1 = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output
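# Both heads map a 768-dim sentence representation to 6 label logits: BertClass
# classifies from the last hidden state of the [CLS] token, while
# PretrainedBertClass classifies from BERT's pooler output (the second element
# of the tuple returned when return_dict=False).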
# User selects model for front-end.
option = st.selectbox("Select a text analysis model:", ("BERT", "Fine-tuned BERT"))
if option == "BERT":
model = PretrainedBertClass()
else:
model = torch.load("pytorch_bert_toxic.bin", map_location=torch.device("cpu"))
# Freeze model and input tokens
def inference():
    model.eval()
    final_targets = []
    final_outputs = []
    with torch.no_grad():
        for _, data in enumerate(infer_loader, 0):
            ids = data["ids"].to(device, dtype=torch.long)
            mask = data["mask"].to(device, dtype=torch.long)
            token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
            targets = data["targets"].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)
            final_targets.extend(targets.cpu().detach().numpy().tolist())
            final_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return final_outputs, final_targets
# Get predictions!
prediction, targets = inference()
# Format and present findings.
best_preds = []
best_labels = []
for example in prediction:
    i = np.argmax(example)
    best_prediction = example[i]
    best_label = label_set[i]
    best_preds.append(best_prediction)
    best_labels.append(best_label)
st.write("Toxicity Classification Result:")
display_table = tweets_raw[["comment_text"]].copy()
display_table["Toxicity Classification"] = best_labels
display_table["Probability"] = best_preds
st.write(display_table)
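# To serve this demo locally (assuming this file is saved as app.py and that
# test.csv, test_labels.csv, and pytorch_bert_toxic.bin sit alongside it):
#   streamlit run app.py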