eaglelandsonce committed on
Commit 3c7fa59 · verified · 1 Parent(s): 8cb7c7e

Update pages/1_TensorIntro.py

Files changed (1)
  pages/1_TensorIntro.py  +42 -26
pages/1_TensorIntro.py CHANGED
@@ -207,27 +207,44 @@ print("Normalized data:", normalized_data)
     "code": '''import torch
 import torch.nn as nn
 import torch.optim as optim
-from torchtext.legacy import data, datasets
-
-# Define the fields for the dataset
-TEXT = data.Field(tokenize='spacy', include_lengths=True)
-LABEL = data.LabelField(dtype=torch.float)
-
-# Load the IMDb dataset
-train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
-
-# Build the vocabulary
-TEXT.build_vocab(train_data, max_size=25000)
-LABEL.build_vocab(train_data)
-
-# Create the iterators
-BATCH_SIZE = 64
-train_iterator, test_iterator = data.BucketIterator.splits(
-    (train_data, test_data),
-    batch_size=BATCH_SIZE,
-    sort_within_batch=True,
-    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-)
+from torchtext.datasets import IMDB
+from torchtext.data.utils import get_tokenizer
+from torchtext.vocab import build_vocab_from_iterator
+from torch.utils.data import DataLoader
+from torch.nn.utils.rnn import pad_sequence
+
+# Define the tokenizer and vocabulary
+tokenizer = get_tokenizer('basic_english')
+train_iter = IMDB(split='train')
+
+def yield_tokens(data_iter):
+    for _, text in data_iter:
+        yield tokenizer(text)
+
+vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
+vocab.set_default_index(vocab["<unk>"])
+
+# Define the text and label preprocessing pipeline
+text_pipeline = lambda x: vocab(tokenizer(x))
+label_pipeline = lambda x: 1 if x == 'pos' else 0
+
+# Define the collate function for the DataLoader
+def collate_batch(batch):
+    label_list, text_list, lengths = [], [], []
+    for _label, _text in batch:
+        label_list.append(label_pipeline(_label))
+        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
+        text_list.append(processed_text)
+        lengths.append(processed_text.size(0))
+    label_list = torch.tensor(label_list, dtype=torch.float)
+    text_list = pad_sequence(text_list, batch_first=True)
+    lengths = torch.tensor(lengths, dtype=torch.int64)
+    return label_list, text_list, lengths
+
+# Create DataLoaders for training and testing
+train_iter, test_iter = IMDB()
+train_dataloader = DataLoader(list(train_iter), batch_size=8, shuffle=True, collate_fn=collate_batch)
+test_dataloader = DataLoader(list(test_iter), batch_size=8, shuffle=False, collate_fn=collate_batch)
 
 # Define the LSTM model
 class LSTM(nn.Module):
@@ -239,12 +256,12 @@ class LSTM(nn.Module):
 
     def forward(self, text, text_lengths):
         embedded = self.embedding(text)
-        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
+        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted=False)
         packed_output, (hidden, cell) = self.lstm(packed_embedded)
         return self.fc(hidden.squeeze(0))
 
 # Instantiate the model
-INPUT_DIM = len(TEXT.vocab)
+INPUT_DIM = len(vocab)
 EMBEDDING_DIM = 100
 HIDDEN_DIM = 256
 OUTPUT_DIM = 1
@@ -258,11 +275,10 @@ optimizer = optim.Adam(model.parameters())
 N_EPOCHS = 5
 for epoch in range(N_EPOCHS):
     model.train()
-    for batch in train_iterator:
+    for labels, text, text_lengths in train_dataloader:
         optimizer.zero_grad()
-        text, text_lengths = batch.text
         predictions = model(text, text_lengths).squeeze(1)
-        loss = criterion(predictions, batch.label)
+        loss = criterion(predictions, labels)
         loss.backward()
         optimizer.step()
 
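
Note: the updated snippet creates test_dataloader but the hunks shown only exercise train_dataloader. Below is a minimal sketch (not part of the commit) of an evaluation pass that consumes the migrated pipeline; model and test_dataloader are the objects defined above, and the assumption that criterion is nn.BCEWithLogitsLoss (its definition falls outside these hunks) is consistent with the single-logit output and float labels used in the training loop, but not confirmed by this diff.

import torch

def evaluate(model, dataloader, criterion):
    # Sketch: average loss and accuracy over the batches produced by collate_batch,
    # which yields (labels, padded text, lengths) in that order.
    model.eval()
    total_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():  # no gradient tracking needed during evaluation
        for labels, text, text_lengths in dataloader:
            predictions = model(text, text_lengths).squeeze(1)
            loss = criterion(predictions, labels)  # assumed nn.BCEWithLogitsLoss()
            total_loss += loss.item() * labels.size(0)
            predicted = torch.round(torch.sigmoid(predictions))  # logits -> 0/1 labels
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    return total_loss / total, correct / total

# Example usage after the training loop:
# test_loss, test_acc = evaluate(model, test_dataloader, criterion)
# print(f"Test loss: {test_loss:.3f} | Test accuracy: {test_acc:.3f}")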