nppmatt committed on
Commit 36ec160
1 Parent(s): ea67d67

Update app.py

Files changed (1)
  1. app.py +33 -33
app.py CHANGED
@@ -7,6 +7,31 @@ from transformers import AutoTokenizer, BertModel
 from sklearn import metrics
 import streamlit as st
 
+# Define models to be used
+bert_path = "bert-base-uncased"
+bert_tokenizer = AutoTokenizer.from_pretrained(bert_path)
+bert_model = BertForSequenceClassification.from_pretrained(bert_path, num_labels=6)
+tuned_model = model = torch.load("pytorch_bert_toxic.bin")
+
+# Read and format data.
+tweets_raw = pd.read_csv("test.csv", nrows=20)
+labels_raw = pd.read_csv("test_labels.csv", nrows=20)
+
+label_set = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
+label_vector = labels_raw[label_set].values.tolist()
+
+tweet_df = tweets_raw[["comment_text"]]
+tweet_df["labels"] = label_vector
+
+# User selects model for front-end.
+option = st.selectbox("Select a text analysis model:", ("BERT", "Fine-tuned BERT"))
+if option == "BERT":
+    tokenizer = bert_tokenizer
+    model = bert_model
+else:
+    tokenizer = bert_tokenizer
+    model = tuned_model
+
 # Dataset for loading tables into DataLoader
 class ToxicityDataset(Dataset):
     def __init__(self, dataframe, tokenizer, max_len):
@@ -42,13 +67,20 @@ class ToxicityDataset(Dataset):
             "targets": torch.tensor(self.targets[index], dtype=torch.float),
         }
 
+# Based on user model selection, prepare Dataset and DataLoader
+MAX_LENGTH = 100
+TEST_BATCH_SIZE = 128
+infer_dataset = ToxicityDataset(tweet_df, tokenizer, MAX_LENGTH)
+infer_params = {"batch_size": TEST_BATCH_SIZE, "shuffle": True, "num_workers": 0}
+infer_loader = DataLoader(test_dataset, **test_params)
+
 # Freeze model and input tokens
 def inference():
     model.eval()
     final_targets = []
     final_outputs = []
     with torch.no_grad():
-        for _, data in enumerate(testing_loader, 0):
+        for _, data in enumerate(infer_loader, 0):
             ids = data["ids"].to(device, dtype=torch.long)
             mask = data["mask"].to(device, dtype=torch.long)
             token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
@@ -58,38 +90,6 @@ def inference():
             final_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
     return final_outputs, final_targets
 
-# Define models to be used
-bert_path = "bert-base-uncased"
-bert_tokenizer = AutoTokenizer.from_pretrained(bert_path)
-bert_model = BertForSequenceClassification.from_pretrained(bert_path, num_labels=6)
-tuned_model = model = torch.load("pytorch_bert_toxic.bin")
-
-# Read and format data.
-tweets_raw = pd.read_csv("test.csv", nrows=20)
-labels_raw = pd.read_csv("test_labels.csv", nrows=20)
-
-label_set = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
-label_vector = labels_raw[label_set].values.tolist()
-
-tweet_df = tweets_raw[["comment_text"]]
-tweet_df["labels"] = label_vector
-
-# User selects model for front-end.
-option = st.selectbox("Select a text analysis model:", ("BERT", "Fine-tuned BERT"))
-if option == "BERT":
-    tokenizer = bert_tokenizer
-    model = bert_model
-else:
-    tokenizer = bert_tokenizer
-    model = tuned_model
-
-# Based on user selection, prepare DataLoader
-MAX_LENGTH = 100
-TEST_BATCH_SIZE = 128
-infer_dataset = ToxicityDataset(tweet_df, tokenizer, MAX_LENGTH)
-infer_params = {"batch_size": TEST_BATCH_SIZE, "shuffle": True, "num_workers": 0}
-testing_loader = DataLoader(test_dataset, **test_params)
-
 prediction, targets = inference()
 prediction = np.array(prediction) >= 0.5
 targets = np.argmax(targets, axis=1)
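
The hunks above show only the constructor signature and the tail of ToxicityDataset.__getitem__; the tokenization that produces the "ids", "mask", and "token_type_ids" tensors consumed by inference() is not part of this diff. Below is a minimal sketch of how that step is commonly written with a Hugging Face tokenizer; the encode_plus arguments and attribute names are assumptions for illustration, not code from this commit.

import torch
from torch.utils.data import Dataset

class ToxicityDataset(Dataset):
    # Hypothetical reconstruction of the elided dataset internals.
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = dataframe["comment_text"]
        self.targets = dataframe["labels"]
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # Tokenize one comment, padding/truncating to max_len (assumed settings).
        encoded = self.tokenizer.encode_plus(
            str(self.text[index]),
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_token_type_ids=True,
        )
        return {
            "ids": torch.tensor(encoded["input_ids"], dtype=torch.long),
            "mask": torch.tensor(encoded["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(encoded["token_type_ids"], dtype=torch.long),
            "targets": torch.tensor(self.targets[index], dtype=torch.float),
        }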
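
The new module-level wiring defines infer_dataset and infer_params, but the committed DataLoader call still reads DataLoader(test_dataset, **test_params), and neither of those names is defined in the lines shown. The sketch below shows the presumably intended hookup (an assumption, not part of commit 36ec160); shuffle=True is kept as committed, though shuffle=False would also be reasonable for pure inference.

from torch.utils.data import DataLoader

MAX_LENGTH = 100
TEST_BATCH_SIZE = 128
infer_dataset = ToxicityDataset(tweet_df, tokenizer, MAX_LENGTH)
infer_params = {"batch_size": TEST_BATCH_SIZE, "shuffle": True, "num_workers": 0}
# Presumed intent: feed the dataset and params defined just above.
infer_loader = DataLoader(infer_dataset, **infer_params)

With that loader in place, inference() returns per-label sigmoid probabilities, which app.py thresholds at 0.5 (prediction = np.array(prediction) >= 0.5), presumably ahead of a metrics computation further down, given the sklearn metrics import.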