eaglelandsonce committed
Commit ad12f7d · verified · 1 Parent(s): 741a2c4

Update pages/21_NLP_Transformer.py

Files changed (1): pages/21_NLP_Transformer.py (+184 -77)
pages/21_NLP_Transformer.py CHANGED
@@ -1,90 +1,197 @@
  import torch
- from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_scheduler
- from datasets import load_dataset
- from torch.utils.data import DataLoader
  import streamlit as st
- import matplotlib.pyplot as plt
- from tqdm.auto import tqdm
-
- # Load pre-trained model and tokenizer from Hugging Face
- tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
- model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
- model.to(device)
-
- # Streamlit interface
- st.title("Sentiment Analysis with BERT")
-
- # Training setup
- st.sidebar.title("Training Setup")
- num_epochs = st.sidebar.slider("Number of Epochs", 1, 5, 3)
- batch_size = st.sidebar.slider("Batch Size", 4, 32, 8)
- learning_rate = st.sidebar.slider("Learning Rate", 1e-6, 1e-3, 5e-5, format="%.6f")
-
- # Define a custom hash function for AddedToken type
- @st.cache_data(hash_funcs={tokenizer.__class__: id})
- def load_and_preprocess_data():
-     dataset = load_dataset("imdb", split="train[:1%]")
-     def preprocess_function(examples):
-         return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
-     encoded_dataset = dataset.map(preprocess_function, batched=True)
-     encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
-     encoded_dataset = encoded_dataset.rename_column("label", "labels")  # Rename the column to 'labels'
-     return DataLoader(encoded_dataset, shuffle=True, batch_size=batch_size)
-
- train_dataloader = load_and_preprocess_data()
-
- # Initialize training status
- training_completed = st.sidebar.empty()
-
  # Training loop
- if st.sidebar.button("Train"):
-     optimizer = AdamW(model.parameters(), lr=learning_rate)
-     num_training_steps = num_epochs * len(train_dataloader)
-     lr_scheduler = get_scheduler(
-         name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
      )
-
-     progress_bar = tqdm(range(num_training_steps))
-     loss_values = []
-
-     model.train()
-     for epoch in range(num_epochs):
-         for batch in train_dataloader:
-             batch = {k: v.to(device) for k, v in batch.items()}
-             outputs = model(**batch)
-             loss = outputs.loss
-             loss.backward()
-
-             optimizer.step()
-             lr_scheduler.step()
-             optimizer.zero_grad()
-             progress_bar.update(1)
-             loss_values.append(loss.item())
-
-     training_completed.success("Training completed")
-
-     # Plot loss values
-     st.write("### Training Loss")
-     plt.figure(figsize=(10, 6))
-     plt.plot(loss_values, label="Training Loss")
-     plt.xlabel("Training Steps")
-     plt.ylabel("Loss")
-     plt.legend()
-     st.pyplot(plt)
-
- # Text input for prediction
- st.write("### Predict Sentiment")
- user_input = st.text_area("Enter text:", "I loved this movie!")
-
- if user_input:
-     inputs = tokenizer(user_input, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
-     inputs = {k: v.to(device) for k, v in inputs.items()}
-
-     model.eval()
      with torch.no_grad():
-         outputs = model(**inputs)
-         prediction = outputs.logits.argmax(dim=-1).item()
-         sentiment = "Positive" if prediction == 1 else "Negative"
-
-     st.write(f"Sentiment: **{sentiment}**")
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
  import torch
+ from torch.utils.data import DataLoader, Dataset
+ from torch.optim import AdamW  # transformers.AdamW is deprecated in recent releases
+ from transformers import BertTokenizer, BertForSequenceClassification
+ from transformers import get_linear_schedule_with_warmup
+ from datasets import load_dataset
+ import numpy as np
+ from sklearn.metrics import accuracy_score, classification_report
  import streamlit as st

+ # Load and preprocess the IMDb dataset. The raw archive at
+ # https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz is a tarball
+ # of text files, not a CSV, so it cannot be read with pd.read_csv; load the
+ # dataset from the Hugging Face hub instead and convert it to a DataFrame.
+ # Columns: 'text' (review) and 'label' (0 = negative, 1 = positive).
+ df = load_dataset("imdb", split="train").to_pandas()
+ train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
+
+ train_df.to_csv('train.csv', index=False)
+ test_df.to_csv('test.csv', index=False)
+
+ class SentimentDataset(Dataset):
+     def __init__(self, dataframe, tokenizer, max_len):
+         self.tokenizer = tokenizer
+         self.data = dataframe
+         self.max_len = max_len
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, index):
+         review = str(self.data.iloc[index, 0])
+         label = self.data.iloc[index, 1]
+
+         encoding = self.tokenizer.encode_plus(
+             review,
+             add_special_tokens=True,
+             max_length=self.max_len,
+             return_token_type_ids=False,
+             padding='max_length',   # pad_to_max_length=True is deprecated
+             truncation=True,
+             return_attention_mask=True,
+             return_tensors='pt',
+         )
+
+         return {
+             'review_text': review,
+             'input_ids': encoding['input_ids'].flatten(),
+             'attention_mask': encoding['attention_mask'].flatten(),
+             'labels': torch.tensor(label, dtype=torch.long)
+         }
+
+ def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
+     model = model.train()
+     losses = []
+     correct_predictions = 0
+
+     for d in data_loader:
+         input_ids = d["input_ids"].to(device)
+         attention_mask = d["attention_mask"].to(device)
+         labels = d["labels"].to(device)
+
+         outputs = model(
+             input_ids=input_ids,
+             attention_mask=attention_mask
+         )

+         loss = loss_fn(outputs.logits, labels)
+         correct_predictions += torch.sum(torch.argmax(outputs.logits, dim=1) == labels)
+         losses.append(loss.item())

+         loss.backward()
+         optimizer.step()
+         scheduler.step()
+         optimizer.zero_grad()

+     return correct_predictions.double() / n_examples, np.mean(losses)

+ def eval_model(model, data_loader, loss_fn, device, n_examples):
+     model = model.eval()
+     losses = []
+     correct_predictions = 0
+
+     with torch.no_grad():
+         for d in data_loader:
+             input_ids = d["input_ids"].to(device)
+             attention_mask = d["attention_mask"].to(device)
+             labels = d["labels"].to(device)
+
+             outputs = model(
+                 input_ids=input_ids,
+                 attention_mask=attention_mask
+             )
+
+             loss = loss_fn(outputs.logits, labels)
+             correct_predictions += torch.sum(torch.argmax(outputs.logits, dim=1) == labels)
+             losses.append(loss.item())
+
+     return correct_predictions.double() / n_examples, np.mean(losses)
+
+ def create_data_loader(df, tokenizer, max_len, batch_size):
+     ds = SentimentDataset(
+         dataframe=df,
+         tokenizer=tokenizer,
+         max_len=max_len
+     )
+
+     return DataLoader(
+         ds,
+         batch_size=batch_size,
+         num_workers=4
+     )
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+
+ # Load data
+ train_df = pd.read_csv('train.csv')
+ test_df = pd.read_csv('test.csv')
+
+ # Create data loaders
+ BATCH_SIZE = 16
+ MAX_LEN = 128
+
+ train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
+ test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)
+
+ EPOCHS = 2
+ # torch.optim.AdamW has no correct_bias flag; it always applies bias correction
+ optimizer = AdamW(model.parameters(), lr=2e-5)
+ total_steps = len(train_data_loader) * EPOCHS
+ scheduler = get_linear_schedule_with_warmup(
+     optimizer,
+     num_warmup_steps=0,
+     num_training_steps=total_steps
+ )
+ loss_fn = torch.nn.CrossEntropyLoss().to(device)
+ model = model.to(device)

  # Training loop
+ for epoch in range(EPOCHS):
+     train_acc, train_loss = train_epoch(
+         model,
+         train_data_loader,
+         loss_fn,
+         optimizer,
+         device,
+         scheduler,
+         len(train_df)
      )

+     print(f'Epoch {epoch + 1}/{EPOCHS}')
+     print(f'Train loss {train_loss} accuracy {train_acc}')
+
+     val_acc, val_loss = eval_model(
+         model,
+         test_data_loader,
+         loss_fn,
+         device,
+         len(test_df)
+     )
+
+     print(f'Val loss {val_loss} accuracy {val_acc}')
+
+ # Save the model
+ model.save_pretrained('bert-sentiment-model')
+ tokenizer.save_pretrained('bert-sentiment-model')
+
+ # Streamlit app
+ model = BertForSequenceClassification.from_pretrained('bert-sentiment-model')
+ tokenizer = BertTokenizer.from_pretrained('bert-sentiment-model')
+ model = model.eval()
+
+ def predict_sentiment(text):
+     encoding = tokenizer.encode_plus(
+         text,
+         add_special_tokens=True,
+         max_length=128,
+         return_token_type_ids=False,
+         padding='max_length',   # pad_to_max_length=True is deprecated
+         truncation=True,
+         return_attention_mask=True,
+         return_tensors='pt',
+     )
+     input_ids = encoding['input_ids']
+     attention_mask = encoding['attention_mask']
+
      with torch.no_grad():
+         outputs = model(input_ids, attention_mask=attention_mask)
+         probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
+         predicted_class = torch.argmax(probabilities, dim=1).item()
+
+     return 'positive' if predicted_class == 1 else 'negative'
+
+ st.title("Sentiment Analysis with BERT")
+ user_input = st.text_area("Enter a movie review:")
+
+ if st.button("Analyze"):
+     sentiment = predict_sentiment(user_input)
+     st.write(f'The sentiment of the review is: **{sentiment}**')
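
The new script imports accuracy_score and classification_report from sklearn.metrics but never calls them. A minimal sketch of how the held-out predictions could feed classification_report, reusing model, test_data_loader, and device from the script above; collect_predictions is a hypothetical helper, not part of this commit:

def collect_predictions(model, data_loader, device):
    # Gather predicted and true labels across a data loader; no gradients needed.
    preds, trues = [], []
    model.eval()
    with torch.no_grad():
        for d in data_loader:
            outputs = model(
                d["input_ids"].to(device),
                attention_mask=d["attention_mask"].to(device),
            )
            preds.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())
            trues.extend(d["labels"].tolist())
    return trues, preds

y_true, y_pred = collect_predictions(model, test_data_loader, device)
print(classification_report(y_true, y_pred, target_names=["negative", "positive"]))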
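
predict_sentiment runs the reloaded model on the CPU even when CUDA is available, because its inputs are never moved to the device chosen earlier. A sketch of a device-aware variant, assuming the same model, tokenizer, and device objects are in scope:

model = model.to(device)

def predict_sentiment(text):
    encoding = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt',
    )
    # Move every tensor in the encoding to the model's device before the forward pass.
    encoding = {k: v.to(device) for k, v in encoding.items()}
    with torch.no_grad():
        logits = model(**encoding).logits
    return 'positive' if logits.argmax(dim=-1).item() == 1 else 'negative'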
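
Because everything above executes at import time, each Streamlit rerun (every widget interaction) repeats the full fine-tuning pass. One way to avoid that, sketched under the assumption that the bert-sentiment-model directory already exists, is to cache the fine-tuned model with st.cache_resource and keep training out of the page script:

import streamlit as st
from transformers import BertTokenizer, BertForSequenceClassification

@st.cache_resource  # executed once per process; later reruns reuse the cached objects
def load_sentiment_model(path="bert-sentiment-model"):
    tokenizer = BertTokenizer.from_pretrained(path)
    model = BertForSequenceClassification.from_pretrained(path).eval()
    return tokenizer, model

tokenizer, model = load_sentiment_model()

The page itself can then be launched with streamlit run pages/21_NLP_Transformer.py.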