Update app.py
app.py CHANGED
@@ -7,6 +7,7 @@ from transformers import AutoTokenizer, BertModel
 from sklearn import metrics
 import streamlit as st
 
+# Dataset for loading tables into DataLoader
 class ToxicityDataset(Dataset):
     def __init__(self, dataframe, tokenizer, max_len):
         self.tokenizer = tokenizer
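The hunk above only shows the top of ToxicityDataset. For orientation, here is a hedged sketch of the shape this class usually takes in this kind of app: the constructor signature and the "targets" tensor line match the diff, the "comment_text"/"labels" columns match the data prep added later in this commit, and the "ids"/"mask" keys plus the encode_plus options are assumptions, not what the file necessarily contains.

# Hedged sketch only: column names follow the data prep added in this commit;
# the dict keys and tokenizer options are guesses.
import torch
from torch.utils.data import Dataset

class ToxicityDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = dataframe["comment_text"].tolist()
        self.targets = dataframe["labels"].tolist()
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        encoded = self.tokenizer.encode_plus(
            str(self.text[index]),
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "ids": encoded["input_ids"].squeeze(0),
            "mask": encoded["attention_mask"].squeeze(0),
            "targets": torch.tensor(self.targets[index], dtype=torch.float),
        }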
@@ -41,7 +42,7 @@ class ToxicityDataset(Dataset):
             "targets": torch.tensor(self.targets[index], dtype=torch.float),
         }
 
-
+# Freeze model and input tokens
 def inference():
     model.eval()
     final_targets = []
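Only fragments of inference() appear in the diff (model.eval(), the empty accumulator list, the sigmoid over the raw outputs, and the returned pair), so the following is a hedged reconstruction rather than the file's actual body: the batch keys, the .logits access, and the no_grad loop are assumptions, and model, device, and testing_loader are module-level globals, as elsewhere in the file.

# Hedged reconstruction: only the lines marked "in the diff" are visible in the commit.
import torch

def inference():
    model.eval()                                   # in the diff
    final_targets = []                             # in the diff
    final_outputs = []
    with torch.no_grad():
        for batch in testing_loader:
            ids = batch["ids"].to(device)
            mask = batch["mask"].to(device)
            outputs = model(input_ids=ids, attention_mask=mask).logits
            final_targets.extend(batch["targets"].numpy().tolist())
            final_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())  # in the diff
    return final_outputs, final_targets            # in the diff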
@@ -57,22 +58,23 @@ def inference():
        final_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
     return final_outputs, final_targets
 
+# Define models to be used
 bert_path = "bert-base-uncased"
 bert_tokenizer = AutoTokenizer.from_pretrained(bert_path)
 bert_model = BertForSequenceClassification.from_pretrained(bert_path, num_labels=6)
 tuned_model = model = torch.load("pytorch_bert_toxic.bin")
 
+# Read and format data.
 tweets_raw = pd.read_csv("test.csv", nrows=20)
 labels_raw = pd.read_csv("test_labels.csv", nrows=20)
-label_set = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
 
+label_set = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
+label_vector = labels_raw[label_set].values.tolist()
 
-
-
-test_dataset = ToxicityDataset(test_df, tokenizer, MAX_LENGTH)
-test_params = {"batch_size": TEST_BATCH_SIZE, "shuffle": True, "num_workers": 0}
-testing_loader = DataLoader(test_dataset, **test_params)
+tweet_df = tweets_raw[["comment_text"]]
+tweet_df["labels"] = label_vector
 
+# User selects model for front-end.
 option = st.selectbox("Select a text analysis model:", ("BERT", "Fine-tuned BERT"))
 if option == "BERT":
     tokenizer = bert_tokenizer
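One hedged note on the model loading in the hunk above: if this Space runs on CPU-only hardware and pytorch_bert_toxic.bin was saved from a GPU session (an assumption, not something the diff states), torch.load needs an explicit map_location to deserialize the weights onto the CPU.

# Assumption: the .bin file is a pickled model saved during GPU fine-tuning;
# map_location remaps CUDA tensors onto the CPU at load time.
import torch

tuned_model = model = torch.load("pytorch_bert_toxic.bin", map_location=torch.device("cpu"))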
@@ -81,20 +83,22 @@ else:
     tokenizer = bert_tokenizer
     model = tuned_model
 
+# Based on user selection, prepare DataLoader
+MAX_LENGTH = 100
+TEST_BATCH_SIZE = 128
+infer_dataset = ToxicityDataset(tweet_df, tokenizer, MAX_LENGTH)
+infer_params = {"batch_size": TEST_BATCH_SIZE, "shuffle": True, "num_workers": 0}
+testing_loader = DataLoader(infer_dataset, **infer_params)
+
 prediction, targets = inference()
 prediction = np.array(prediction) >= 0.5
-
 targets = np.argmax(targets, axis=1)
 prediction = np.argmax(prediction, axis=1)
 accuracy = metrics.accuracy_score(targets, prediction)
 f1_score_micro = metrics.f1_score(targets, prediction, average="micro")
 f1_score_macro = metrics.f1_score(targets, prediction, average="macro")
 
-
-
-
-
-# Write results
-st.write("Classification Probabilities")
-st.write(f"{neutralProb:.4f} - NEUTRAL")
-st.write(f"{toxicProb:.4f} - TOXIC")
+st.write(prediction)
+st.write(f"Accuracy Score = {accuracy}")
+st.write(f"F1 Score (Micro) = {f1_score_micro}")
+st.write(f"F1 Score (Macro) = {f1_score_macro}")
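For readers following the evaluation step added above, a tiny self-contained illustration of what those lines compute: the sigmoid outputs are thresholded at 0.5, then both the multi-hot targets and the thresholded predictions are collapsed to one class index with argmax before accuracy and F1 are taken. The numbers below are made up; the six columns correspond to label_set.

import numpy as np
from sklearn import metrics

# Made-up sigmoid outputs for two comments across the six labels in label_set.
outputs = np.array([[0.91, 0.12, 0.78, 0.05, 0.40, 0.08],
                    [0.10, 0.07, 0.22, 0.03, 0.31, 0.11]])
targets = np.array([[1, 0, 1, 0, 0, 0],
                    [0, 0, 0, 0, 0, 0]])

prediction = outputs >= 0.5               # boolean multi-hot matrix, as in the app
pred_idx = np.argmax(prediction, axis=1)  # index of the first predicted label per row
true_idx = np.argmax(targets, axis=1)     # rows with no positive label collapse to index 0

print(metrics.accuracy_score(true_idx, pred_idx))                 # 1.0 for this toy data
print(metrics.f1_score(true_idx, pred_idx, average="micro"))
print(metrics.f1_score(true_idx, pred_idx, average="macro"))

Since argmax keeps only the first maximal entry per row, a comment with no positive labels maps to index 0 ("toxic"), so these scores describe a single-label view of the data rather than per-label multi-label performance.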