Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -1,35 +1,100 @@
+import numpy as np
 import pandas as pd
 import torch
-
+from torch import nn
+from torch.utils.data import Dataset, DataLoader
+from transformers import AutoTokenizer, BertForSequenceClassification
+from sklearn import metrics
 import streamlit as st
 
-
+class ToxicityDataset(Dataset):
+    def __init__(self, dataframe, tokenizer, max_len):
+        self.tokenizer = tokenizer
+        self.data = dataframe
+        self.text = self.data.comment_text
+        self.targets = self.data.labels
+        self.max_len = max_len
 
-
-
-
-
-
-
-
+    def __len__(self):
+        return len(self.text)
+
+    def __getitem__(self, index):
+        text = str(self.text[index])
+        text = " ".join(text.split())
+
+        inputs = self.tokenizer.encode_plus(
+            text,
+            None,
+            add_special_tokens=True,
+            max_length=self.max_len,
+            padding="max_length",
+            truncation=True,
+            return_token_type_ids=True,
+        )
+        ids = inputs["input_ids"]
+        mask = inputs["attention_mask"]
+        token_type_ids = inputs["token_type_ids"]
+        return {
+            "ids": torch.tensor(ids, dtype=torch.long),
+            "mask": torch.tensor(mask, dtype=torch.long),
+            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
+            "targets": torch.tensor(self.targets[index], dtype=torch.float),
+        }
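
Note: the class assumes its DataFrame exposes a comment_text column and a labels column holding one list of six 0/1 flags per row, and that the frame has a clean positional index, since self.text[index] and self.targets[index] are label-based lookups. A minimal sketch of the expected input; the demo frame and tokenizer here are illustrative, not part of the commit:

# Illustrative only: the shape of DataFrame that ToxicityDataset expects.
demo_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
demo_df = pd.DataFrame({
    "comment_text": ["you are wonderful", "you are terrible"],
    "labels": [[0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0]],
})
demo_ds = ToxicityDataset(demo_df, demo_tokenizer, max_len=100)
item = demo_ds[0]
print(item["ids"].shape)  # torch.Size([100])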
+
+
+def inference():
+    model.eval()
+    final_targets = []
+    final_outputs = []
+    with torch.no_grad():
+        for _, data in enumerate(testing_loader, 0):
+            ids = data["ids"].to(device, dtype=torch.long)
+            mask = data["mask"].to(device, dtype=torch.long)
+            token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
+            targets = data["targets"].to(device, dtype=torch.float)
+            outputs = model(ids, mask, token_type_ids)
+            final_targets.extend(targets.cpu().detach().numpy().tolist())
+            final_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
+    return final_outputs, final_targets
 
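Note: inference() reads model, testing_loader, and device from module scope, but nothing in the new file ever defines device, so the loop fails with a NameError as written. A minimal sketch, assuming GPU-or-CPU selection is wanted:

# Assumed, not in the commit: device placement for model and batches.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Also worth noting: a Hugging Face BertForSequenceClassification returns a SequenceClassifierOutput rather than a raw tensor, so torch.sigmoid(outputs) would need torch.sigmoid(outputs.logits) for that model type.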
-
+bert_path = "bert-base-uncased"
+bert_tokenizer = AutoTokenizer.from_pretrained(bert_path)
+bert_model = BertForSequenceClassification.from_pretrained(bert_path, num_labels=6)
+tuned_model = torch.load("pytorch_bert_toxic.bin")
 
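Note: torch.load on a whole pickled model only works when the checkpoint was saved that way and the defining class is importable; on a CPU-only Space, a GPU-saved checkpoint also needs map_location. A hedged sketch covering the state_dict case; whether pytorch_bert_toxic.bin is a pickled model or a state_dict is an assumption here:

# Sketch, assuming the .bin file may be a state_dict rather than a pickled model.
checkpoint = torch.load("pytorch_bert_toxic.bin", map_location=torch.device("cpu"))
if isinstance(checkpoint, dict):
    # Looks like a state_dict: load it into a fresh model instance.
    tuned_model = BertForSequenceClassification.from_pretrained(bert_path, num_labels=6)
    tuned_model.load_state_dict(checkpoint)
else:
    # A fully pickled nn.Module.
    tuned_model = checkpoint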
+tweets_raw = pd.read_csv("test.csv", nrows=20)
+labels_raw = pd.read_csv("test_labels.csv", nrows=20)
 label_set = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
 
-# Run encoding through model to get classification output.
-encoding = tokenizer.encode(txt, return_tensors='pt')
-result = model(encoding)
 
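Note: the commit reads tweets_raw and labels_raw but never builds the test_df that the next block passes to ToxicityDataset, which raises a NameError and is one plausible source of the Space's runtime error. One hypothetical way to assemble it; the column handling is an assumption based on what the dataset class accesses:

# Hypothetical glue, not in the commit: merge the six label columns into the
# single list-valued "labels" column that ToxicityDataset reads.
tweets_raw["labels"] = labels_raw[label_set].values.tolist()
test_df = tweets_raw[["comment_text", "labels"]]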
-
-
-
-
-
-
-
+MAX_LENGTH = 100
+TEST_BATCH_SIZE = 128
+test_dataset = ToxicityDataset(test_df, tokenizer, MAX_LENGTH)
+test_params = {"batch_size": TEST_BATCH_SIZE, "shuffle": True, "num_workers": 0}
+testing_loader = DataLoader(test_dataset, **test_params)
+
+option = st.selectbox("Select a text analysis model:", ("BERT", "Fine-tuned BERT"))
+if option == "BERT":
+    tokenizer = bert_tokenizer
+    model = bert_model
+else:
+    tokenizer = bert_tokenizer
+    model = tuned_model
+
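Note: as written, ToxicityDataset(test_df, tokenizer, MAX_LENGTH) runs before the selectbox assigns tokenizer, so the script would fail on its first pass even with test_df defined. A sketch of the reordered flow; since both branches pick the same tokenizer, it can be hoisted:

# Sketch: choose the model first, then build the dataset and loader.
option = st.selectbox("Select a text analysis model:", ("BERT", "Fine-tuned BERT"))
tokenizer = bert_tokenizer
model = bert_model if option == "BERT" else tuned_model

test_dataset = ToxicityDataset(test_df, tokenizer, MAX_LENGTH)
testing_loader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=False, num_workers=0)

Here shuffle=False keeps predictions aligned with the frame's row order; shuffling adds nothing during evaluation.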
+prediction, targets = inference()
+prediction = np.array(prediction) >= 0.5
+
+targets = np.argmax(targets, axis=1)
+prediction = np.argmax(prediction, axis=1)
+accuracy = metrics.accuracy_score(targets, prediction)
+f1_score_micro = metrics.f1_score(targets, prediction, average="micro")
+f1_score_macro = metrics.f1_score(targets, prediction, average="macro")
+
+print(f"Accuracy Score = {accuracy}")
+print(f"F1 Score (Micro) = {f1_score_micro}")
+print(f"F1 Score (Macro) = {f1_score_macro}")
 
 # Write results
 st.write("Classification Probabilities")
 st.write(f"{neutralProb:.4f} - NEUTRAL")
-st.write(f"{toxicProb:.4f} - TOXIC")
+st.write(f"{toxicProb:.4f} - TOXIC")
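
Note: neutralProb and toxicProb belonged to the deleted script and are never defined in the new one, so the final st.write calls raise a NameError; this alone would leave the Space in the "Runtime error" state shown above. A hypothetical way to produce them for a single input; the names and flow are assumed, not from the commit:

# Hypothetical, not in the commit: score one text and derive display values.
txt = st.text_input("Text to classify:", "I hope you have a great day!")
enc = tokenizer(txt, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
with torch.no_grad():
    out = model(**enc)
logits = out.logits if hasattr(out, "logits") else out  # handle either output type
probs = torch.sigmoid(logits)[0]
toxicProb = float(probs.max())      # strongest of the six toxicity labels
neutralProb = 1.0 - toxicProb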