# milestone-3 / app.py
# Revision 617d510 ("Update app.py", author: nppmatt), 3.54 kB
import numpy as np
import pandas as pd
import streamlit as st
import torch
from sklearn import metrics
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, BertForSequenceClassification, BertModel
class ToxicityDataset(Dataset):
    """Map-style dataset that tokenizes comments for toxicity classification.

    Args:
        dataframe: pandas frame exposing a ``comment_text`` column (raw text)
            and a ``labels`` column whose entries can be converted to a float
            tensor.
        tokenizer: Object providing ``encode_plus`` that returns a mapping
            with ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = self.data.comment_text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        """Return the number of comments in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded comment and its target at position ``index``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors)
            and ``targets`` (float tensor).
        """
        # Samplers hand out positions 0..len-1, so look rows up positionally.
        # Label-based ``series[index]`` breaks on frames whose index is not a
        # default RangeIndex (e.g. after filtering or sampling).
        text = str(self.text.iloc[index])
        # Collapse all runs of whitespace (including newlines) to one space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_token_type_ids=True,
        )

        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(
                inputs["token_type_ids"], dtype=torch.long
            ),
            "targets": torch.tensor(
                self.targets.iloc[index], dtype=torch.float
            ),
        }
def inference(net=None, loader=None, run_device=None):
    """Run the model over a data loader and collect sigmoid scores.

    Called with no arguments, this uses the module-level ``model``,
    ``testing_loader`` and ``device``; each can be overridden explicitly.

    Args:
        net: Callable model taking ``(ids, mask, token_type_ids)``. It may
            return either a logits tensor or an object exposing ``.logits``.
        loader: Iterable of batches; each batch is a mapping with ``ids``,
            ``mask``, ``token_type_ids`` and ``targets`` tensors.
        run_device: Device the batch tensors are moved to before the forward
            pass.

    Returns:
        ``(final_outputs, final_targets)``: two lists with one entry per
        sample, holding the sigmoid-activated scores and the targets.
    """
    # Globals are only looked up when the corresponding argument is omitted.
    net = model if net is None else net
    loader = testing_loader if loader is None else loader
    run_device = device if run_device is None else run_device

    net.eval()
    final_targets = []
    final_outputs = []
    with torch.no_grad():
        for data in loader:
            ids = data["ids"].to(run_device, dtype=torch.long)
            mask = data["mask"].to(run_device, dtype=torch.long)
            token_type_ids = data["token_type_ids"].to(
                run_device, dtype=torch.long
            )
            targets = data["targets"].to(run_device, dtype=torch.float)

            outputs = net(ids, mask, token_type_ids)
            # Hugging Face classification heads return an output object, not
            # a tensor; unwrap it so torch.sigmoid receives the raw logits.
            logits = getattr(outputs, "logits", outputs)

            final_targets.extend(targets.cpu().tolist())
            final_outputs.extend(torch.sigmoid(logits).cpu().tolist())
    return final_outputs, final_targets
# Streamlit script: score a small sample of test comments with either the
# stock BERT classifier or the fine-tuned checkpoint, then report metrics.

# inference() reads `device`, `model` and `testing_loader` as module globals.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

bert_path = "bert-base-uncased"
label_set = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
MAX_LENGTH = 100
TEST_BATCH_SIZE = 128

bert_tokenizer = AutoTokenizer.from_pretrained(bert_path)
bert_model = BertForSequenceClassification.from_pretrained(
    bert_path, num_labels=len(label_set)
)
# SECURITY: torch.load unpickles arbitrary Python objects; only load
# checkpoints from a trusted source. map_location lets a checkpoint saved on
# GPU load on a CPU-only host.
tuned_model = torch.load("pytorch_bert_toxic.bin", map_location=device)

tweets_raw = pd.read_csv("test.csv", nrows=20)
labels_raw = pd.read_csv("test_labels.csv", nrows=20)

# ToxicityDataset reads a `comment_text` column and a `labels` column.
# NOTE(review): assumes the two CSVs are row-aligned, that test.csv has a
# `comment_text` column, and that test_labels.csv has one column per entry
# in `label_set` -- confirm against the data files.
test_df = pd.DataFrame(
    {
        "comment_text": tweets_raw["comment_text"].to_numpy(),
        "labels": labels_raw[label_set].to_numpy().tolist(),
    }
)

# Both model choices share the same tokenizer, so it can be fixed up front
# (it must exist before the dataset is built).
tokenizer = bert_tokenizer
test_dataset = ToxicityDataset(test_df, tokenizer, MAX_LENGTH)
# Evaluation does not benefit from shuffling; keep row order deterministic.
test_params = {"batch_size": TEST_BATCH_SIZE, "shuffle": False, "num_workers": 0}
testing_loader = DataLoader(test_dataset, **test_params)

option = st.selectbox("Select a text analysis model:", ("BERT", "Fine-tuned BERT"))
model = bert_model if option == "BERT" else tuned_model
model.to(device)

prediction, targets = inference()
probabilities = np.array(prediction)

# Compare multi-label indicator matrices directly. Collapsing each row with
# argmax would map "no label set" onto index 0 (the "toxic" column).
prediction = probabilities >= 0.5
targets = np.array(targets) >= 0.5

accuracy = metrics.accuracy_score(targets, prediction)
f1_score_micro = metrics.f1_score(targets, prediction, average="micro")
f1_score_macro = metrics.f1_score(targets, prediction, average="macro")
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

# Write results
# NOTE(review): the summary probabilities were never defined in the original
# script. Here "toxic" is the highest label score of each comment, averaged
# over the evaluated comments, and "neutral" is its complement -- confirm
# this is the intended definition.
toxic_prob = float(probabilities.max(axis=1).mean())
neutral_prob = 1.0 - toxic_prob
st.write("Classification Probabilities")
st.write(f"{neutral_prob:.4f} - NEUTRAL")
st.write(f"{toxic_prob:.4f} - TOXIC")