Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -1,35 +1,100 @@
+import numpy as np
 import pandas as pd
 import torch
-
+from torch import nn
+from torch.utils.data import Dataset, DataLoader
+from transformers import AutoTokenizer, BertForSequenceClassification
+from sklearn import metrics
 import streamlit as st
 
-
+class ToxicityDataset(Dataset):
+    def __init__(self, dataframe, tokenizer, max_len):
+        self.tokenizer = tokenizer
+        self.data = dataframe
+        self.text = self.data.comment_text
+        self.targets = self.data.labels
+        self.max_len = max_len
 
-
-
-
-
-
-
-
+    def __len__(self):
+        return len(self.text)
+
+    def __getitem__(self, index):
+        text = str(self.text[index])
+        text = " ".join(text.split())
+
+        inputs = self.tokenizer.encode_plus(
+            text,
+            None,
+            add_special_tokens=True,
+            max_length=self.max_len,
+            padding="max_length",
+            truncation=True,
+            return_token_type_ids=True,
+        )
+        ids = inputs["input_ids"]
+        mask = inputs["attention_mask"]
+        token_type_ids = inputs["token_type_ids"]
+        return {
+            "ids": torch.tensor(ids, dtype=torch.long),
+            "mask": torch.tensor(mask, dtype=torch.long),
+            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
+            "targets": torch.tensor(self.targets[index], dtype=torch.float),
+        }
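
Note: the class assumes its DataFrame exposes a comment_text column and a labels column holding one list of six 0/1 flags per row, and that the frame has a clean positional index, since self.text[index] and self.targets[index] are label-based lookups. A minimal sketch of the expected input; the demo frame and tokenizer here are illustrative, not part of the commit:

# Illustrative only: the shape of DataFrame that ToxicityDataset expects.
demo_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
demo_df = pd.DataFrame({
    "comment_text": ["you are wonderful", "you are terrible"],
    "labels": [[0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0]],
})
demo_ds = ToxicityDataset(demo_df, demo_tokenizer, max_len=100)
item = demo_ds[0]
print(item["ids"].shape)  # torch.Size([100])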
+
+
+def inference():
+    model.eval()
+    final_targets = []
+    final_outputs = []
+    with torch.no_grad():
+        for _, data in enumerate(testing_loader, 0):
+            ids = data["ids"].to(device, dtype=torch.long)
+            mask = data["mask"].to(device, dtype=torch.long)
+            token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
+            targets = data["targets"].to(device, dtype=torch.float)
+            outputs = model(ids, mask, token_type_ids)
+            final_targets.extend(targets.cpu().detach().numpy().tolist())
+            final_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
+    return final_outputs, final_targets
 
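Note: inference() reads model, testing_loader, and device from module scope, but nothing in the new file ever defines device, so the loop fails with a NameError as written. A minimal sketch, assuming GPU-or-CPU selection is wanted:

# Assumed, not in the commit: device placement for model and batches.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Also worth noting: a Hugging Face BertForSequenceClassification returns a SequenceClassifierOutput rather than a raw tensor, so torch.sigmoid(outputs) would need torch.sigmoid(outputs.logits) for that model type.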
-
+bert_path = "bert-base-uncased"
+bert_tokenizer = AutoTokenizer.from_pretrained(bert_path)
+bert_model = BertForSequenceClassification.from_pretrained(bert_path, num_labels=6)
+tuned_model = torch.load("pytorch_bert_toxic.bin")
 
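Note: torch.load on a whole pickled model only works when the checkpoint was saved that way and the defining class is importable; on a CPU-only Space, a GPU-saved checkpoint also needs map_location. A hedged sketch covering the state_dict case; whether pytorch_bert_toxic.bin is a pickled model or a state_dict is an assumption here:

# Sketch, assuming the .bin file may be a state_dict rather than a pickled model.
checkpoint = torch.load("pytorch_bert_toxic.bin", map_location=torch.device("cpu"))
if isinstance(checkpoint, dict):
    # Looks like a state_dict: load it into a fresh model instance.
    tuned_model = BertForSequenceClassification.from_pretrained(bert_path, num_labels=6)
    tuned_model.load_state_dict(checkpoint)
else:
    # A fully pickled nn.Module.
    tuned_model = checkpoint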
+tweets_raw = pd.read_csv("test.csv", nrows=20)
+labels_raw = pd.read_csv("test_labels.csv", nrows=20)
 label_set = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
 
-# Run encoding through model to get classification output.
-encoding = tokenizer.encode(txt, return_tensors='pt')
-result = model(encoding)
 
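Note: the commit reads tweets_raw and labels_raw but never builds the test_df that the next block passes to ToxicityDataset, which raises a NameError and is one plausible source of the Space's runtime error. One hypothetical way to assemble it; the column handling is an assumption based on what the dataset class accesses:

# Hypothetical glue, not in the commit: merge the six label columns into the
# single list-valued "labels" column that ToxicityDataset reads.
tweets_raw["labels"] = labels_raw[label_set].values.tolist()
test_df = tweets_raw[["comment_text", "labels"]]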
-
-
-
-
-
-
-
+MAX_LENGTH = 100
+TEST_BATCH_SIZE = 128
+test_dataset = ToxicityDataset(test_df, tokenizer, MAX_LENGTH)
+test_params = {"batch_size": TEST_BATCH_SIZE, "shuffle": True, "num_workers": 0}
+testing_loader = DataLoader(test_dataset, **test_params)
+
+option = st.selectbox("Select a text analysis model:", ("BERT", "Fine-tuned BERT"))
+if option == "BERT":
+    tokenizer = bert_tokenizer
+    model = bert_model
+else:
+    tokenizer = bert_tokenizer
+    model = tuned_model
+
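Note: as written, ToxicityDataset(test_df, tokenizer, MAX_LENGTH) runs before the selectbox assigns tokenizer, so the script would fail on its first pass even with test_df defined. A sketch of the reordered flow; since both branches pick the same tokenizer, it can be hoisted:

# Sketch: choose the model first, then build the dataset and loader.
option = st.selectbox("Select a text analysis model:", ("BERT", "Fine-tuned BERT"))
tokenizer = bert_tokenizer
model = bert_model if option == "BERT" else tuned_model

test_dataset = ToxicityDataset(test_df, tokenizer, MAX_LENGTH)
testing_loader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=False, num_workers=0)

Here shuffle=False keeps predictions aligned with the frame's row order; shuffling adds nothing during evaluation.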
+prediction, targets = inference()
+prediction = np.array(prediction) >= 0.5
+
+targets = np.argmax(targets, axis=1)
+prediction = np.argmax(prediction, axis=1)
+accuracy = metrics.accuracy_score(targets, prediction)
+f1_score_micro = metrics.f1_score(targets, prediction, average="micro")
+f1_score_macro = metrics.f1_score(targets, prediction, average="macro")
+
+print(f"Accuracy Score = {accuracy}")
+print(f"F1 Score (Micro) = {f1_score_micro}")
+print(f"F1 Score (Macro) = {f1_score_macro}")
 
 # Write results
 st.write("Classification Probabilities")
 st.write(f"{neutralProb:.4f} - NEUTRAL")
-st.write(f"{toxicProb:.4f} - TOXIC")
+st.write(f"{toxicProb:.4f} - TOXIC")
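
Note: neutralProb and toxicProb belonged to the deleted script and are never defined in the new one, so the final st.write calls raise a NameError; this alone would leave the Space in the "Runtime error" state shown above. A hypothetical way to produce them for a single input; the names and flow are assumed, not from the commit:

# Hypothetical, not in the commit: score one text and derive display values.
txt = st.text_input("Text to classify:", "I hope you have a great day!")
enc = tokenizer(txt, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
with torch.no_grad():
    out = model(**enc)
logits = out.logits if hasattr(out, "logits") else out  # handle either output type
probs = torch.sigmoid(logits)[0]
toxicProb = float(probs.max())      # strongest of the six toxicity labels
neutralProb = 1.0 - toxicProb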