nppmatt committed on
Commit
617d510
1 Parent(s): 1c2d1c1

Update app.py

Files changed (1)
  app.py +86 -21
app.py CHANGED
@@ -1,35 +1,100 @@
 
  import pandas as pd
  import torch
- import torch.nn.functional as TF
  import streamlit as st

- option = st.selectbox("Select a text analysis model:", ("BERT", "Fine-tuned BERT"))

- bert_path = "bert-base-uncased"
- if (option == "BERT"):
-
-     tokenizer = AutoTokenizer.from_pretrained(bert_path)
-     model = BertForSequenceClassification.from_pretrained(bert_path, num_labels=6)
- else:
-

- tweets_raw = pd.read_csv("train.csv", nrows=20)

  label_set = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

- # Run encoding through model to get classification output.
- encoding = tokenizer.encode(txt, return_tensors='pt')
- result = model(encoding)

- # Transform logit to get probabilities.
- if (result.logits.size(dim=1) < 2):
-     pad = (0, 1)
-     result.logits = nn.functional.pad(result.logits, pad, "constant", 0)
- prediction = nn.functional.softmax(result.logits, dim=-1)
- neutralProb = prediction.data[0][neutralIndex]
- toxicProb = prediction.data[0][toxicIndex]

  # Write results
  st.write("Classification Probabilities")
  st.write(f"{neutralProb:.4f} - NEUTRAL")
- st.write(f"{toxicProb:.4f} - TOXIC")
 
+ import numpy as np
  import pandas as pd
  import torch
+ from torch import nn
+ from torch.utils.data import Dataset, DataLoader
+ from transformers import AutoTokenizer, BertModel
+ from sklearn import metrics
  import streamlit as st

+ class ToxicityDataset(Dataset):
+     def __init__(self, dataframe, tokenizer, max_len):
+         self.tokenizer = tokenizer
+         self.data = dataframe
+         self.text = self.data.comment_text
+         self.targets = self.data.labels
+         self.max_len = max_len

+     def __len__(self):
+         return len(self.text)
+
+     def __getitem__(self, index):
+         text = str(self.text[index])
+         text = " ".join(text.split())
+
+         inputs = self.tokenizer.encode_plus(
+             text,
+             None,
+             add_special_tokens=True,
+             max_length=self.max_len,
+             padding="max_length",
+             truncation=True,
+             return_token_type_ids=True,
+         )
+         ids = inputs["input_ids"]
+         mask = inputs["attention_mask"]
+         token_type_ids = inputs["token_type_ids"]
+         return {
+             "ids": torch.tensor(ids, dtype=torch.long),
+             "mask": torch.tensor(mask, dtype=torch.long),
+             "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
+             "targets": torch.tensor(self.targets[index], dtype=torch.float),
+         }
+
+
+ def inference():
+     model.eval()
+     final_targets = []
+     final_outputs = []
+     with torch.no_grad():
+         for _, data in enumerate(testing_loader, 0):
+             ids = data["ids"].to(device, dtype=torch.long)
+             mask = data["mask"].to(device, dtype=torch.long)
+             token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
+             targets = data["targets"].to(device, dtype=torch.float)
+             outputs = model(ids, mask, token_type_ids)
+             final_targets.extend(targets.cpu().detach().numpy().tolist())
+             final_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
+     return final_outputs, final_targets

+ bert_path = "bert-base-uncased"
+ bert_tokenizer = AutoTokenizer.from_pretrained(bert_path)
+ bert_model = BertForSequenceClassification.from_pretrained(bert_path, num_labels=6)
+ tuned_model = model = torch.load("pytorch_bert_toxic.bin")

+ tweets_raw = pd.read_csv("test.csv", nrows=20)
+ labels_raw = pd.read_csv("test_labels.csv", nrows=20)
  label_set = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

+ MAX_LENGTH = 100
+ TEST_BATCH_SIZE = 128
+ test_dataset = ToxicityDataset(test_df, tokenizer, MAX_LENGTH)
+ test_params = {"batch_size": TEST_BATCH_SIZE, "shuffle": True, "num_workers": 0}
+ testing_loader = DataLoader(test_dataset, **test_params)
+
+ option = st.selectbox("Select a text analysis model:", ("BERT", "Fine-tuned BERT"))
+ if option == "BERT":
+     tokenizer = bert_tokenizer
+     model = bert_model
+ else:
+     tokenizer = bert_tokenizer
+     model = tuned_model
+
+ prediction, targets = inference()
+ prediction = np.array(prediction) >= 0.5
+
+ targets = np.argmax(targets, axis=1)
+ prediction = np.argmax(prediction, axis=1)
+ accuracy = metrics.accuracy_score(targets, prediction)
+ f1_score_micro = metrics.f1_score(targets, prediction, average="micro")
+ f1_score_macro = metrics.f1_score(targets, prediction, average="macro")
+
+ print(f"Accuracy Score = {accuracy}")
+ print(f"F1 Score (Micro) = {f1_score_micro}")
+ print(f"F1 Score (Macro) = {f1_score_macro}")

  # Write results
  st.write("Classification Probabilities")
  st.write(f"{neutralProb:.4f} - NEUTRAL")
+ st.write(f"{toxicProb:.4f} - TOXIC")