import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, BertForSequenceClassification
from sklearn import metrics
import streamlit as st
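
# Streamlit demo: score Jigsaw-style toxic-comment labels with either a base BERT
# model or a fine-tuned checkpoint, then report accuracy and F1 on a small test slice.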
class ToxicityDataset(Dataset):
    """Wraps a dataframe of comments and multi-hot toxicity labels for BERT."""

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = self.data.comment_text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # Normalize whitespace, then tokenize to fixed-length input tensors.
        text = str(self.text[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_token_type_ids=True,
        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "targets": torch.tensor(self.targets[index], dtype=torch.float),
        }
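
# Run the selected model over the test loader and collect sigmoid probabilities
# alongside the gold labels.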
def inference():
    model.eval()
    final_targets = []
    final_outputs = []
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data["ids"].to(device, dtype=torch.long)
            mask = data["mask"].to(device, dtype=torch.long)
            token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
            targets = data["targets"].to(device, dtype=torch.float)
            outputs = model(ids, attention_mask=mask, token_type_ids=token_type_ids)
            # Hugging Face sequence-classification models return a ModelOutput;
            # fall back to the raw tensor for a custom fine-tuned checkpoint.
            logits = outputs.logits if hasattr(outputs, "logits") else outputs
            final_targets.extend(targets.cpu().detach().numpy().tolist())
            final_outputs.extend(torch.sigmoid(logits).cpu().detach().numpy().tolist())
    return final_outputs, final_targets
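
# Runtime setup: device, tokenizer, the baseline and fine-tuned models, and the test DataLoader.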
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

bert_path = "bert-base-uncased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_path)
bert_model = BertForSequenceClassification.from_pretrained(bert_path, num_labels=6)
bert_model.to(device)

# Fine-tuned checkpoint saved as a full model object.
tuned_model = torch.load("pytorch_bert_toxic.bin", map_location=device)
tuned_model.to(device)

tweets_raw = pd.read_csv("test.csv", nrows=20)
labels_raw = pd.read_csv("test_labels.csv", nrows=20)

label_set = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
MAX_LENGTH = 100
TEST_BATCH_SIZE = 128

# Build the evaluation frame: comment text from test.csv plus a list of six 0/1
# labels per row (test_labels.csv columns are assumed to match label_set).
test_df = tweets_raw.copy()
test_df["labels"] = labels_raw[label_set].values.tolist()

# Both model options below share the same base tokenizer.
test_dataset = ToxicityDataset(test_df, bert_tokenizer, MAX_LENGTH)
test_params = {"batch_size": TEST_BATCH_SIZE, "shuffle": True, "num_workers": 0}
testing_loader = DataLoader(test_dataset, **test_params)
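
# Streamlit UI: let the user pick a model, run inference, and report the scores.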
option = st.selectbox("Select a text analysis model:", ("BERT", "Fine-tuned BERT"))
if option == "BERT":
    tokenizer = bert_tokenizer
    model = bert_model
else:
    tokenizer = bert_tokenizer
    model = tuned_model
prediction, targets = inference()
prediction = np.array(prediction)
targets = np.array(targets)

# The original script references neutralProb and toxicProb without defining them;
# as a placeholder assumption, report the "toxic" probability of the first test comment.
toxicProb = float(prediction[0][label_set.index("toxic")])
neutralProb = 1.0 - toxicProb

# Collapse the multi-label outputs to a single class per comment before scoring.
targets = np.argmax(targets, axis=1)
prediction = np.argmax(prediction >= 0.5, axis=1)

accuracy = metrics.accuracy_score(targets, prediction)
f1_score_micro = metrics.f1_score(targets, prediction, average="micro")
f1_score_macro = metrics.f1_score(targets, prediction, average="macro")

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

# Write results
st.write("Classification Probabilities")
st.write(f"{neutralProb:.4f} - NEUTRAL")
st.write(f"{toxicProb:.4f} - TOXIC")