cgr28 committed on
Commit
8fdaf9e
·
1 Parent(s): c073d3a

milestone-3

Browse files
Files changed (2) hide show
  1. app.py +22 -21
  2. milestone_3.py → train.py +5 -6
app.py CHANGED
@@ -1,30 +1,31 @@
1
  import streamlit as st
2
- from transformers import AutoTokenizer, RobertaForSequenceClassification
3
  import numpy as np
4
  import torch
 
 
5
 
6
- # assignment 2
7
- st.title("CS482 Project Sentiment Analysis")
 
 
 
 
 
8
 
9
- text = st.text_area(label="Text to be analyzed", value="This sentiment analysis app is great!")
10
 
11
- selected_model = st.radio(label="Model", options=["Model 1", "Model 2"])
 
 
 
 
 
 
12
 
13
- analyze_button = st.button(label="Analyze")
14
 
15
- st.markdown("**:red[Sentiment:]**")
 
16
 
17
- with st.spinner(text="Analyzing..."):
18
- if analyze_button:
19
- if selected_model=="Model 1":
20
- tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")
21
- model = RobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")
22
- else:
23
- tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
24
- model = RobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
25
- inputs = tokenizer(text, return_tensors="pt")
26
- with torch.no_grad():
27
- logits = model(**inputs).logits
28
- prediction_id = logits.argmax().item()
29
- results = model.config.id2label[prediction_id]
30
- st.write(results)
 
"""Streamlit demo for CS482 milestone 3.

Runs the pretrained ``unitary/toxic-bert`` classifier over a few hard-coded
sample tweets and renders the top label and its score for each tweet as a
table.
"""
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import numpy as np
import torch
import pandas as pd
import torch.nn.functional as F

model_name = "unitary/toxic-bert"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

sample_tweets = [
    "Ask Sityush to clean up his behavior than issue me nonsensical warnings...",
    "be a man and lets discuss it-maybe over the phone?",
    "Don't look, come or think of comming back! Tosser.",
]

# The pipeline tokenizes the inputs itself, so no separate tokenizer call is
# needed here (the previous, unused `batch = tokenizer(...)` was dead work).
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
results = classifier(sample_tweets)

# assignment 3
st.title("CS482 Project Sentiment Analysis")

st.markdown("**:red[unitary/toxic-bert]**")

# Build the table in one shot: one row per tweet, pairing each tweet with the
# label/score dict the pipeline returned for it.
df = pd.DataFrame(
    [
        (tweet, result["label"], result["score"])
        for tweet, result in zip(sample_tweets, results)
    ],
    columns=["Tweet", "Toxicity", "Probability"],
)

st.table(df)
 
 
 
 
 
 
 
 
 
 
 
 
 
milestone_3.py → train.py RENAMED
@@ -1,4 +1,4 @@
1
- from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
2
  import torch
3
  from torch.utils.data import Dataset
4
  # from torch.optim import AdamW
@@ -7,7 +7,7 @@ from sklearn.model_selection import train_test_split
7
 
8
 
9
  # assignment 3
10
- model_name = "distilbert-base-uncased"
11
 
12
  class ToxicDataset(Dataset):
13
 
@@ -18,7 +18,6 @@ class ToxicDataset(Dataset):
18
  def __getitem__(self, idx):
19
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
20
  item["labels"] = torch.tensor(self.labels[idx])
21
- print(item)
22
  return item
23
 
24
  def __len__(self):
@@ -35,7 +34,7 @@ train_texts, val_texts, train_labels, val_labels = train_test_split(toxic_data.t
35
 
36
 
37
  print("Data split. Tokenizing data...")
38
- tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
39
 
40
  train_encodings = tokenizer.batch_encode_plus(train_texts, truncation=True, padding=True, return_tensors='pt')
41
  val_encodings = tokenizer.batch_encode_plus(val_texts, truncation=True, padding=True, return_tensors='pt')
@@ -59,7 +58,7 @@ training_args = TrainingArguments(
59
 
60
  # device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
61
 
62
- model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=6)
63
 
64
  trainer = Trainer(
65
  model=model,
@@ -101,7 +100,7 @@ trainer.train()
101
 
102
  print("Training complete. Saving model...")
103
 
104
- save_directory = ".results/model"
105
  model.save_pretrained(save_directory)
106
 
107
  print("Model saved.")
 
1
from transformers import BertForSequenceClassification, BertModel, BertTokenizerFast, Trainer, TrainingArguments
2
  import torch
3
  from torch.utils.data import Dataset
4
  # from torch.optim import AdamW
 
7
 
8
 
9
  # assignment 3
10
+ model_name = "bert-base-uncased"
11
 
12
  class ToxicDataset(Dataset):
13
 
 
18
  def __getitem__(self, idx):
19
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
20
  item["labels"] = torch.tensor(self.labels[idx])
 
21
  return item
22
 
23
  def __len__(self):
 
34
 
35
 
36
  print("Data split. Tokenizing data...")
37
+ tokenizer = BertTokenizerFast.from_pretrained(model_name)
38
 
39
  train_encodings = tokenizer.batch_encode_plus(train_texts, truncation=True, padding=True, return_tensors='pt')
40
  val_encodings = tokenizer.batch_encode_plus(val_texts, truncation=True, padding=True, return_tensors='pt')
 
58
 
59
  # device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
60
 
61
# BertModel is the bare encoder: it has no classification head and its
# forward() does not accept `labels`, so Trainer cannot compute a loss with it.
# Load the sequence-classification variant so num_labels=6 sizes a real output
# layer (this mirrors the DistilBertForSequenceClassification used previously).
# NOTE(review): if the six labels are multi-hot vectors, passing
# problem_type="multi_label_classification" may also be required — confirm
# against how `train_labels` is built.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)
62
 
63
  trainer = Trainer(
64
  model=model,
 
100
 
101
  print("Training complete. Saving model...")
102
 
103
+ save_directory = "./results/model"
104
  model.save_pretrained(save_directory)
105
 
106
  print("Model saved.")