Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -7,6 +7,31 @@ from transformers import AutoTokenizer, BertModel
|
|
7 |
from sklearn import metrics
|
8 |
import streamlit as st
|
9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
# Dataset for loading tables into DataLoader
|
11 |
class ToxicityDataset(Dataset):
|
12 |
def __init__(self, dataframe, tokenizer, max_len):
|
@@ -42,13 +67,20 @@ class ToxicityDataset(Dataset):
|
|
42 |
"targets": torch.tensor(self.targets[index], dtype=torch.float),
|
43 |
}
|
44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
# Freeze model and input tokens
|
46 |
def inference():
|
47 |
model.eval()
|
48 |
final_targets = []
|
49 |
final_outputs = []
|
50 |
with torch.no_grad():
|
51 |
-
for _, data in enumerate(
|
52 |
ids = data["ids"].to(device, dtype=torch.long)
|
53 |
mask = data["mask"].to(device, dtype=torch.long)
|
54 |
token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
|
@@ -58,38 +90,6 @@ def inference():
|
|
58 |
final_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
|
59 |
return final_outputs, final_targets
|
60 |
|
61 |
-
# Define models to be used
|
62 |
-
bert_path = "bert-base-uncased"
|
63 |
-
bert_tokenizer = AutoTokenizer.from_pretrained(bert_path)
|
64 |
-
bert_model = BertForSequenceClassification.from_pretrained(bert_path, num_labels=6)
|
65 |
-
tuned_model = model = torch.load("pytorch_bert_toxic.bin")
|
66 |
-
|
67 |
-
# Read and format data.
|
68 |
-
tweets_raw = pd.read_csv("test.csv", nrows=20)
|
69 |
-
labels_raw = pd.read_csv("test_labels.csv", nrows=20)
|
70 |
-
|
71 |
-
label_set = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
|
72 |
-
label_vector = labels_raw[label_set].values.tolist()
|
73 |
-
|
74 |
-
tweet_df = tweets_raw[["comment_text"]]
|
75 |
-
tweet_df["labels"] = label_vector
|
76 |
-
|
77 |
-
# User selects model for front-end.
|
78 |
-
option = st.selectbox("Select a text analysis model:", ("BERT", "Fine-tuned BERT"))
|
79 |
-
if option == "BERT":
|
80 |
-
tokenizer = bert_tokenizer
|
81 |
-
model = bert_model
|
82 |
-
else:
|
83 |
-
tokenizer = bert_tokenizer
|
84 |
-
model = tuned_model
|
85 |
-
|
86 |
-
# Based on user selection, prepare DataLoader
|
87 |
-
MAX_LENGTH = 100
|
88 |
-
TEST_BATCH_SIZE = 128
|
89 |
-
infer_dataset = ToxicityDataset(tweet_df, tokenizer, MAX_LENGTH)
|
90 |
-
infer_params = {"batch_size": TEST_BATCH_SIZE, "shuffle": True, "num_workers": 0}
|
91 |
-
testing_loader = DataLoader(test_dataset, **test_params)
|
92 |
-
|
93 |
prediction, targets = inference()
|
94 |
prediction = np.array(prediction) >= 0.5
|
95 |
targets = np.argmax(targets, axis=1)
|
|
|
7 |
from sklearn import metrics
|
8 |
import streamlit as st
|
9 |
|
10 |
+
# Define models to be used.
# Both the stock BERT and the fine-tuned checkpoint share the same tokenizer.
bert_path = "bert-base-uncased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_path)
bert_model = BertForSequenceClassification.from_pretrained(bert_path, num_labels=6)
# NOTE(security): torch.load unpickles arbitrary objects and can execute code.
# Only load checkpoints from a trusted source.
# (Fixed: dropped the dead `model =` alias here — `model` is unconditionally
# reassigned by the model-selection branch below, so the alias only confused readers.)
tuned_model = torch.load("pytorch_bert_toxic.bin")
|
15 |
+
|
16 |
+
# Read and format data (first 20 rows only — this is a demo-sized sample).
tweets_raw = pd.read_csv("test.csv", nrows=20)
labels_raw = pd.read_csv("test_labels.csv", nrows=20)

# The six Jigsaw toxicity label columns, in the order the model emits them.
label_set = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
label_vector = labels_raw[label_set].values.tolist()

# .copy() fixes the pandas chained-assignment problem: tweets_raw[["comment_text"]]
# is a slice of tweets_raw, and adding a column to a slice raises
# SettingWithCopyWarning and is not guaranteed to take effect.
tweet_df = tweets_raw[["comment_text"]].copy()
tweet_df["labels"] = label_vector
|
25 |
+
|
26 |
+
# User selects model for front-end.
option = st.selectbox("Select a text analysis model:", ("BERT", "Fine-tuned BERT"))

# Both choices use the same tokenizer; only the weights differ, so the
# duplicated `tokenizer = bert_tokenizer` in each branch is hoisted out.
tokenizer = bert_tokenizer
model = bert_model if option == "BERT" else tuned_model
|
34 |
+
|
35 |
# Dataset for loading tables into DataLoader
|
36 |
class ToxicityDataset(Dataset):
|
37 |
def __init__(self, dataframe, tokenizer, max_len):
|
|
|
67 |
"targets": torch.tensor(self.targets[index], dtype=torch.float),
|
68 |
}
|
69 |
|
70 |
+
# Based on user model selection, prepare Dataset and DataLoader.
MAX_LENGTH = 100
TEST_BATCH_SIZE = 128
infer_dataset = ToxicityDataset(tweet_df, tokenizer, MAX_LENGTH)
# NOTE(review): shuffle=True is kept from the original, but shuffling serves no
# purpose at inference time (targets and outputs are collected from the same
# loop, so metrics stay aligned either way); consider shuffle=False.
infer_params = {"batch_size": TEST_BATCH_SIZE, "shuffle": True, "num_workers": 0}
# BUG FIX: the original called DataLoader(test_dataset, **test_params) — both
# names are undefined in this file and raised NameError at runtime (the
# "Runtime error" banner on the Space). Use the names defined just above.
infer_loader = DataLoader(infer_dataset, **infer_params)
|
76 |
+
|
77 |
# Freeze model and input tokens
|
78 |
def inference():
|
79 |
model.eval()
|
80 |
final_targets = []
|
81 |
final_outputs = []
|
82 |
with torch.no_grad():
|
83 |
+
for _, data in enumerate(infer_loader, 0):
|
84 |
ids = data["ids"].to(device, dtype=torch.long)
|
85 |
mask = data["mask"].to(device, dtype=torch.long)
|
86 |
token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
|
|
|
90 |
final_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
|
91 |
return final_outputs, final_targets
|
92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
# Run the frozen model over the DataLoader and collect sigmoid outputs/targets.
prediction, targets = inference()
# Threshold sigmoid probabilities at 0.5 -> boolean multi-label predictions, shape (N, 6).
prediction = np.array(prediction) >= 0.5
# NOTE(review): argmax collapses the multi-label targets to a single class index
# per row, while `prediction` stays multi-label — these two shapes look
# inconsistent for most sklearn metrics; confirm the downstream metric call
# (not visible in this chunk) really expects argmax'd targets.
targets = np.argmax(targets, axis=1)
|