eaglelandsonce committed
Commit c4eca14 · verified · 1 Parent(s): f50c8e1

Update pages/21_NLP_Transformer.py

Files changed (1)
1. pages/21_NLP_Transformer.py +33 -193
pages/21_NLP_Transformer.py CHANGED
@@ -1,199 +1,39 @@
-import torch
-from torch.utils.data import DataLoader, Dataset
-from transformers import BertTokenizer, BertForSequenceClassification, AdamW
-from transformers import get_linear_schedule_with_warmup
-import numpy as np
-from datasets import load_dataset
 import streamlit as st
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
+import matplotlib.pyplot as plt
 
-# Load IMDb dataset
-dataset = load_dataset('imdb')
-train_df = dataset['train'].to_pandas()
-test_df = dataset['test'].to_pandas()
-
-# Preprocess the data
-train_df = train_df[['text', 'label']]
-test_df = test_df[['text', 'label']]
-
-class SentimentDataset(Dataset):
-    def __init__(self, dataframe, tokenizer, max_len):
-        self.tokenizer = tokenizer
-        self.data = dataframe
-        self.max_len = max_len
-
-    def __len__(self):
-        return len(self.data)
-
-    def __getitem__(self, index):
-        review = str(self.data.iloc[index, 0])
-        label = self.data.iloc[index, 1]
-
-        encoding = self.tokenizer.encode_plus(
-            review,
-            add_special_tokens=True,
-            max_length=self.max_len,
-            return_token_type_ids=False,
-            pad_to_max_length=True,
-            return_attention_mask=True,
-            return_tensors='pt',
-        )
-
-        return {
-            'review_text': review,
-            'input_ids': encoding['input_ids'].flatten(),
-            'attention_mask': encoding['attention_mask'].flatten(),
-            'labels': torch.tensor(label, dtype=torch.long)
-        }
-
-def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
-    model = model.train()
-    losses = []
-    correct_predictions = 0
-
-    for d in data_loader:
-        input_ids = d["input_ids"].to(device)
-        attention_mask = d["attention_mask"].to(device)
-        labels = d["labels"].to(device)
-
-        outputs = model(
-            input_ids=input_ids,
-            attention_mask=attention_mask
-        )
-
-        loss = loss_fn(outputs.logits, labels)
-        correct_predictions += torch.sum(torch.argmax(outputs.logits, dim=1) == labels)
-        losses.append(loss.item())
-
-        loss.backward()
-        optimizer.step()
-        scheduler.step()
-        optimizer.zero_grad()
-
-    return correct_predictions.double() / n_examples, np.mean(losses)
-
-def eval_model(model, data_loader, loss_fn, device, n_examples):
-    model = model.eval()
-    losses = []
-    correct_predictions = 0
-
-    with torch.no_grad():
-        for d in data_loader:
-            input_ids = d["input_ids"].to(device)
-            attention_mask = d["attention_mask"].to(device)
-            labels = d["labels"].to(device)
-
-            outputs = model(
-                input_ids=input_ids,
-                attention_mask=attention_mask
-            )
-
-            loss = loss_fn(outputs.logits, labels)
-            correct_predictions += torch.sum(torch.argmax(outputs.logits, dim=1) == labels)
-            losses.append(loss.item())
-
-    return correct_predictions.double() / n_examples, np.mean(losses)
-
-def create_data_loader(df, tokenizer, max_len, batch_size):
-    ds = SentimentDataset(
-        dataframe=df,
-        tokenizer=tokenizer,
-        max_len=max_len
-    )
-
-    return DataLoader(
-        ds,
-        batch_size=batch_size,
-        num_workers=4
-    )
-
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-
-# Create data loaders
-BATCH_SIZE = 16
-MAX_LEN = 128
-
-train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
-test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)
-
-EPOCHS = 2
-optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
-total_steps = len(train_data_loader) * EPOCHS
-scheduler = get_linear_schedule_with_warmup(
-    optimizer,
-    num_warmup_steps=0,
-    num_training_steps=total_steps
-)
-loss_fn = torch.nn.CrossEntropyLoss().to(device)
-model = model.to(device)
-
-# Streamlit app
-st.title("Sentiment Analysis with BERT")
-st.write("""
-This application allows you to train a BERT model for sentiment analysis on the IMDb dataset.
-You can input a movie review and the model will predict whether the sentiment is positive or negative.
-""")
-
-if st.button("Train Model"):
-    with st.spinner("Training the model..."):
-        # Training loop
-        for epoch in range(EPOCHS):
-            train_acc, train_loss = train_epoch(
-                model,
-                train_data_loader,
-                loss_fn,
-                optimizer,
-                device,
-                scheduler,
-                len(train_df)
-            )
-
-            st.write(f'Epoch {epoch + 1}/{EPOCHS}')
-            st.write(f'Train loss {train_loss} accuracy {train_acc}')
-
-            val_acc, val_loss = eval_model(
-                model,
-                test_data_loader,
-                loss_fn,
-                device,
-                len(test_df)
-            )
-
-            st.write(f'Val loss {val_loss} accuracy {val_acc}')
-
-        # Save the model
-        model.save_pretrained('bert-sentiment-model')
-        tokenizer.save_pretrained('bert-sentiment-model')
-        st.success("Model training complete!")
-
-model = BertForSequenceClassification.from_pretrained('bert-sentiment-model')
-tokenizer = BertTokenizer.from_pretrained('bert-sentiment-model')
-model = model.eval()
-
-def predict_sentiment(text):
-    encoding = tokenizer.encode_plus(
-        text,
-        add_special_tokens=True,
-        max_length=128,
-        return_token_type_ids=False,
-        pad_to_max_length=True,
-        return_attention_mask=True,
-        return_tensors='pt',
-    )
-    input_ids = encoding['input_ids']
-    attention_mask = encoding['attention_mask']
+# Load model and tokenizer
+model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
 
-    with torch.no_grad():
-        outputs = model(input_ids, attention_mask=attention_mask)
-        probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
-        predicted_class = torch.argmax(probabilities, dim=1).item()
-
-    return 'positive' if predicted_class == 1 else 'negative'
+# Streamlit interface
+st.title("Sentiment Analysis with Hugging Face Transformers")
+st.write("Enter text to analyze its sentiment:")
 
-st.title("Sentiment Analysis with BERT")
-user_input = st.text_area("Enter a movie review:")
+input_text = st.text_area("Input Text", height=200)
 
 if st.button("Analyze"):
-    sentiment = predict_sentiment(user_input)
-    st.write(f'The sentiment of the review is: **{sentiment}**')
+    if input_text:
+        # Perform sentiment analysis
+        results = classifier(input_text)
+
+        # Display results
+        st.write("Results:")
+        st.write(results)
+
+        # Extract scores for plotting
+        scores = results[0]['score']
+        labels = results[0]['label']
+
+        # Plotting
+        fig, ax = plt.subplots()
+        ax.bar(labels, scores, color='skyblue')
+        ax.set_ylabel('Score')
+        ax.set_title('Sentiment Analysis Result')
+
+        st.pyplot(fig)
+    else:
+        st.write("Please enter text to analyze.")