import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, BertModel, BertForSequenceClassification
from sklearn import metrics
import streamlit as st
# Model class used when the fine-tuned checkpoint was saved; torch.load needs this
# definition available in order to unpickle the saved model.
class BertClass(torch.nn.Module):
    def __init__(self):
        super(BertClass, self).__init__()
        self.l1 = BertModel.from_pretrained(model_path)
        self.dropout = torch.nn.Dropout(HEAD_DROP_OUT)
        self.classifier = torch.nn.Linear(768, 6)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]  # [CLS] token representation
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output
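# NOTE (added commentary, not from the original file): `model_path` and `HEAD_DROP_OUT`
# are assumed to be constants defined in the fine-tuning script. They are only needed when
# a new BertClass is instantiated; torch.load below restores an already-built model, so
# they do not have to be defined in this app.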
# Define the models offered in the UI: stock BERT and the fine-tuned checkpoint.
bert_path = "bert-base-uncased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_path)
bert_model = BertForSequenceClassification.from_pretrained(bert_path, num_labels=6)
tuned_model = torch.load("pytorch_bert_toxic.bin", map_location=torch.device("cpu"))
# Read the evaluation data and attach the label vectors to the tweets.
tweets_raw = pd.read_csv("test.csv", nrows=20)
labels_raw = pd.read_csv("test_labels.csv", nrows=20)
label_set = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
label_vector = labels_raw[label_set].values.tolist()
tweet_df = tweets_raw[["comment_text"]].copy()  # copy() avoids pandas' SettingWithCopyWarning
tweet_df["labels"] = label_vector
# User selects model for front-end.
option = st.selectbox("Select a text analysis model:", ("BERT", "Fine-tuned BERT"))
if option == "BERT":
    tokenizer = bert_tokenizer
    model = bert_model
else:
    tokenizer = bert_tokenizer
    model = tuned_model
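# Added note (assumption, not from the original file): both branches reuse bert_tokenizer,
# presumably because the fine-tuned model also starts from bert-base-uncased and therefore
# shares its vocabulary.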
# Dataset for loading tables into DataLoader
class ToxicityDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = self.data.comment_text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())  # collapse whitespace
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_token_type_ids=True,
        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "targets": torch.tensor(self.targets[index], dtype=torch.float),
        }
# Based on user model selection, prepare Dataset and DataLoader
MAX_LENGTH = 100
INFER_BATCH_SIZE = 128
infer_dataset = ToxicityDataset(tweet_df, tokenizer, MAX_LENGTH)
infer_params = {"batch_size": INFER_BATCH_SIZE, "shuffle": False}
infer_loader = DataLoader(infer_dataset, **infer_params)
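# Illustrative only (not part of the original app): a quick local sanity check of one batch.
# Uncomment to inspect the tensor shapes produced by the loader.
# batch = next(iter(infer_loader))
# print(batch["ids"].shape, batch["mask"].shape, batch["targets"].shape)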
# Run the selected model over the DataLoader in eval mode, without gradient tracking.
device = torch.device("cpu")  # the Space runs on CPU; the checkpoint was loaded with map_location="cpu"

def inference():
    model.eval()
    final_targets = []
    final_outputs = []
    with torch.no_grad():
        for data in infer_loader:
            ids = data["ids"].to(device, dtype=torch.long)
            mask = data["mask"].to(device, dtype=torch.long)
            token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
            targets = data["targets"].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)
            # BertForSequenceClassification returns a ModelOutput; the custom BertClass returns raw logits.
            logits = outputs.logits if hasattr(outputs, "logits") else outputs
            final_targets.extend(targets.cpu().numpy().tolist())
            final_outputs.extend(torch.sigmoid(logits).cpu().numpy().tolist())
    return final_outputs, final_targets
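# Score the predictions: threshold the sigmoid outputs at 0.5, then reduce both
# predictions and targets to a single label per tweet via argmax before computing metrics.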
prediction, targets = inference()
prediction = np.array(prediction) >= 0.5
targets = np.argmax(targets, axis=1)
prediction = np.argmax(prediction, axis=1)
accuracy = metrics.accuracy_score(targets, prediction)
f1_score_micro = metrics.f1_score(targets, prediction, average="micro")
f1_score_macro = metrics.f1_score(targets, prediction, average="macro")
st.write(prediction)
st.write(f"Accuracy Score = {accuracy}")
st.write(f"F1 Score (Micro) = {f1_score_micro}")
st.write(f"F1 Score (Macro) = {f1_score_macro}")