eaglelandsonce committed
Commit e84d316 · verified · 1 Parent(s): c4dc6bc

Rename pages/RNN.py to pages/17_RNN.py

Files changed (1):
  1. pages/{RNN.py → 17_RNN.py} +69 -32
pages/{RNN.py → 17_RNN.py} RENAMED
@@ -2,11 +2,16 @@ import streamlit as st
 import torch
 import torch.nn as nn
 import torch.optim as optim
-from torchtext.legacy import data, datasets
+from torchtext.data.utils import get_tokenizer
+from torchtext.vocab import build_vocab_from_iterator, GloVe
+from torchtext.datasets import IMDB
+from torch.utils.data import DataLoader, random_split
 import matplotlib.pyplot as plt
 import seaborn as sns
 import pandas as pd
 import numpy as np
+from collections import Counter
+from torch.nn.utils.rnn import pad_sequence
 
 # Define the RNN model
 class RNN(nn.Module):
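
Note: beyond the rename, this commit replaces the removed torchtext.legacy pipeline with the current torchtext API. One knock-on effect the context lines do not show: the new collate_batch below pads batches batch-first, while the legacy BucketIterator yielded sequence-first tensors, so the RNN class (whose body sits outside the hunks) has to run with batch_first=True. A minimal stand-in for that class under this assumption, not the commit's actual definition:

import torch.nn as nn

class RNN(nn.Module):  # hypothetical sketch; the real class body is not shown in this diff
    def __init__(self, vocab_size, embed_size, hidden_size, output_size, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, n_layers,
                          dropout=dropout, batch_first=True)  # batch-first to match pad_sequence
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, text):                 # text: (batch, seq_len) token ids
        output, hidden = self.rnn(self.embedding(text))
        return self.fc(hidden[-1])           # raw logits; BCEWithLogitsLoss applies the sigmoid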
@@ -27,22 +32,52 @@ class RNN(nn.Module):
 # Function to load the data
 @st.cache_data
 def load_data():
-    TEXT = data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm')
-    LABEL = data.LabelField(dtype=torch.float)
-    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
-    train_data, valid_data = train_data.split(split_ratio=0.8)
+    tokenizer = get_tokenizer("basic_english")
+    train_iter, test_iter = IMDB(split=('train', 'test'))
 
-    MAX_VOCAB_SIZE = 25_000
-    TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE, vectors="glove.6B.100d", unk_init=torch.Tensor.normal_)
-    LABEL.build_vocab(train_data)
+    def yield_tokens(data_iter):
+        for _, text in data_iter:
+            yield tokenizer(text)
+
+    vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
+    vocab.set_default_index(vocab["<unk>"])
+
+    # Define the text and label processing pipelines
+    text_pipeline = lambda x: vocab(tokenizer(x))
+    label_pipeline = lambda x: 1 if x == 'pos' else 0
+
+    # Process the data into tensors
+    def process_data(data_iter):
+        texts, labels = [], []
+        for label, text in data_iter:
+            texts.append(torch.tensor(text_pipeline(text), dtype=torch.long))
+            labels.append(label_pipeline(label))
+        return texts, torch.tensor(labels, dtype=torch.float)
+
+    train_texts, train_labels = process_data(train_iter)
+    test_texts, test_labels = process_data(test_iter)
+
+    # Create a custom collate function to pad sequences
+    def collate_batch(batch):
+        texts, labels = zip(*batch)
+        text_lengths = [len(text) for text in texts]
+        texts_padded = pad_sequence(texts, batch_first=True, padding_value=vocab["<pad>"])
+        return texts_padded, torch.tensor(labels, dtype=torch.float), text_lengths
+
+    # Create DataLoaders
+    train_dataset = list(zip(train_texts, train_labels))
+    test_dataset = list(zip(test_texts, test_labels))
+
+    train_size = int(0.8 * len(train_dataset))
+    valid_size = len(train_dataset) - train_size
+    train_dataset, valid_dataset = random_split(train_dataset, [train_size, valid_size])
 
     BATCH_SIZE = 64
-    train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
-        (train_data, valid_data, test_data),
-        batch_size=BATCH_SIZE,
-        device=device)
-
-    return TEXT, LABEL, train_iterator, valid_iterator, test_iterator
+    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
+    valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
+    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
+
+    return vocab, train_loader, valid_loader, test_loader
 
 # Function to train the network
 def train_network(net, iterator, optimizer, criterion, epochs):
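
Note: a few pitfalls in the rewritten load_data are worth flagging rather than silently editing into the diff. In recent torchtext releases the IMDB iterators are single-pass, so yield_tokens(train_iter) exhausts the data before process_data(train_iter) ever runs; the vocab is built with only "<unk>" in specials, so the vocab["<pad>"] lookup in collate_batch silently resolves to the default (unknown) index; depending on the release, IMDB yields the integer labels 1 and 2 rather than 'pos'/'neg'; and st.cache_data pickles return values, which fails for DataLoaders built around a local collate_fn (st.cache_resource is the usual fix). A self-contained sketch of the materialize-then-reuse pattern and the dedicated padding index, using stand-in documents instead of the IMDB download:

import torch
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")
docs = ["a great film", "a dull film"]   # hypothetical stand-ins for the IMDB texts

# materialize once, iterate as often as needed; a raw single-pass
# iterator would be empty by the second traversal
tokens = [tokenizer(d) for d in docs]

# reserve a dedicated "<pad>" entry so padding gets its own index instead
# of falling back to the "<unk>" default index
vocab = build_vocab_from_iterator(tokens, specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])

batch = [torch.tensor(vocab(t), dtype=torch.long) for t in tokens]
padded = pad_sequence(batch, batch_first=True, padding_value=vocab["<pad>"])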
@@ -50,10 +85,11 @@ def train_network(net, iterator, optimizer, criterion, epochs):
     for epoch in range(epochs):
         epoch_loss = 0
         net.train()
-        for batch in iterator:
+        for texts, labels, _ in iterator:
+            texts, labels = texts.to(device), labels.to(device)
             optimizer.zero_grad()
-            predictions = net(batch.text).squeeze(1)
-            loss = criterion(predictions, batch.label)
+            predictions = net(texts).squeeze(1)
+            loss = criterion(predictions, labels)
             loss.backward()
             optimizer.step()
             epoch_loss += loss.item()
@@ -72,14 +108,15 @@ def evaluate_network(net, iterator, criterion):
     all_predictions = []
     net.eval()
     with torch.no_grad():
-        for batch in iterator:
-            predictions = net(batch.text).squeeze(1)
-            loss = criterion(predictions, batch.label)
+        for texts, labels, _ in iterator:
+            texts, labels = texts.to(device), labels.to(device)
+            predictions = net(texts).squeeze(1)
+            loss = criterion(predictions, labels)
             epoch_loss += loss.item()
             rounded_preds = torch.round(torch.sigmoid(predictions))
-            correct += (rounded_preds == batch.label).sum().item()
-            total += len(batch.label)
-            all_labels.extend(batch.label.cpu().numpy())
+            correct += (rounded_preds == labels).sum().item()
+            total += len(labels)
+            all_labels.extend(labels.cpu().numpy())
             all_predictions.extend(rounded_preds.cpu().numpy())
     accuracy = 100 * correct / total
     st.write(f'Loss: {epoch_loss / len(iterator):.4f}, Accuracy: {accuracy:.2f}%')
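
Note: the hunk shows all_labels and all_predictions being collected, while evaluate_network's return statement lives in unchanged lines; the call sites below expect (accuracy, all_labels, all_predictions) back. Since pandas and seaborn are already imported, those two lists could also feed a confusion matrix; a hypothetical usage sketch, not something this commit adds:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import streamlit as st

def show_confusion(all_labels, all_predictions):
    # cross-tabulate ground truth against the rounded predictions
    cm = pd.crosstab(pd.Series(all_labels, name='actual'),
                     pd.Series(all_predictions, name='predicted'))
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    st.pyplot(fig)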
@@ -87,7 +124,7 @@ def evaluate_network(net, iterator, criterion):
 
 # Load the data
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-TEXT, LABEL, train_iterator, valid_iterator, test_iterator = load_data()
+vocab, train_loader, valid_loader, test_loader = load_data()
 
 # Streamlit interface
 st.title("RNN for Text Classification on IMDb Dataset")
@@ -106,7 +143,7 @@ learning_rate = st.sidebar.slider('Learning Rate', 0.001, 0.1, 0.01, step=0.001)
 epochs = st.sidebar.slider('Epochs', 1, 20, 5)
 
 # Create the network
-vocab_size = len(TEXT.vocab)
+vocab_size = len(vocab)
 output_size = 1
 net = RNN(vocab_size, embed_size, hidden_size, output_size, n_layers, dropout).to(device)
 criterion = nn.BCEWithLogitsLoss()
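
Note: GloVe is imported at the top of the new file but never used, whereas the deleted build_vocab call loaded glove.6B.100d vectors. If pretrained embeddings are still wanted, they can be copied into the network after construction; a sketch that assumes the model exposes its nn.Embedding as net.embedding and that embed_size is set to 100:

from torchtext.vocab import GloVe

glove = GloVe(name='6B', dim=100)                        # the vectors the legacy code requested
pretrained = glove.get_vecs_by_tokens(vocab.get_itos())  # (len(vocab), 100); zeros for misses
with torch.no_grad():
    net.embedding.weight.copy_(pretrained)               # only valid when embed_size == 100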
@@ -117,7 +154,7 @@ st.write('\n' * 10)
 
 # Train the network
 if st.sidebar.button('Train Network'):
-    loss_values = train_network(net, train_iterator, optimizer, criterion, epochs)
+    loss_values = train_network(net, train_loader, optimizer, criterion, epochs)
 
     # Plot the loss values
     plt.figure(figsize=(10, 5))
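
Note: the Test and Show branches below guard on st.session_state['trained_model'], but no visible line ever stores it; the assignment presumably sits in the unchanged tail of this Train branch. If it does not, a single line after training would be needed (hypothetical):

    st.session_state['trained_model'] = net  # persist the trained model across Streamlit reruns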
@@ -133,7 +170,7 @@ if st.sidebar.button('Train Network'):
 
 # Test the network
 if 'trained_model' in st.session_state and st.sidebar.button('Test Network'):
-    accuracy, all_labels, all_predictions = evaluate_network(st.session_state['trained_model'], test_iterator, criterion)
+    accuracy, all_labels, all_predictions = evaluate_network(st.session_state['trained_model'], test_loader, criterion)
     st.write(f'Test Accuracy: {accuracy:.2f}%')
 
 # Display results in a table
@@ -149,17 +186,17 @@ def visualize_text_predictions(iterator, net):
     net.eval()
     samples = []
     with torch.no_grad():
-        for batch in iterator:
-            predictions = torch.round(torch.sigmoid(net(batch.text).squeeze(1)))
-            samples.extend(zip(batch.text.cpu(), batch.label.cpu(), predictions.cpu()))
+        for texts, labels, _ in iterator:
+            predictions = torch.round(torch.sigmoid(net(texts).squeeze(1)))
+            samples.extend(zip(texts.cpu(), labels.cpu(), predictions.cpu()))
             if len(samples) >= 10:
                 break
     return samples[:10]
 
 if 'trained_model' in st.session_state and st.sidebar.button('Show Test Results'):
-    samples = visualize_text_predictions(test_iterator, st.session_state['trained_model'])
+    samples = visualize_text_predictions(test_loader, st.session_state['trained_model'])
     st.write('Ground Truth vs Predicted for Sample Texts')
     for i, (text, true_label, predicted) in enumerate(samples):
         st.write(f'Sample {i+1}')
-        st.text(' '.join([TEXT.vocab.itos[token] for token in text]))
+        st.text(' '.join([vocab.itos[token] for token in text]))
         st.write(f'Ground Truth: {true_label.item()}, Predicted: {predicted.item()}')
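
Note: two details in this final hunk look fragile against the very API this commit migrates to. The Vocab returned by build_vocab_from_iterator does not have the legacy .itos attribute, so vocab.itos[token] raises AttributeError; index-to-token lookup goes through lookup_tokens() or get_itos(). And visualize_text_predictions never moves texts to the device before the forward pass, which breaks on CUDA. Corrected lines, assuming the names used in this file:

# index-to-token lookup on the new Vocab API
st.text(' '.join(vocab.lookup_tokens(text.tolist())))

# move the batch to the model's device before predicting
predictions = torch.round(torch.sigmoid(net(texts.to(device)).squeeze(1)))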