Update pages/1_TensorIntro.py
pages/1_TensorIntro.py  CHANGED  (+42 -26)
@@ -207,27 +207,44 @@ print("Normalized data:", normalized_data)
     "code": '''import torch
 import torch.nn as nn
 import torch.optim as optim
-from torchtext.
-#
+from torchtext.datasets import IMDB
+from torchtext.data.utils import get_tokenizer
+from torchtext.vocab import build_vocab_from_iterator
+from torch.utils.data import DataLoader
+from torch.nn.utils.rnn import pad_sequence
+
+# Define the tokenizer and vocabulary
+tokenizer = get_tokenizer('basic_english')
+train_iter = IMDB(split='train')
+
+def yield_tokens(data_iter):
+    for _, text in data_iter:
+        yield tokenizer(text)
+
+vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
+vocab.set_default_index(vocab["<unk>"])
+
+# Define the text and label preprocessing pipeline
+text_pipeline = lambda x: vocab(tokenizer(x))
+label_pipeline = lambda x: 1 if x == 'pos' else 0
+
+# Define the collate function for the DataLoader
+def collate_batch(batch):
+    label_list, text_list, lengths = [], [], []
+    for _label, _text in batch:
+        label_list.append(label_pipeline(_label))
+        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
+        text_list.append(processed_text)
+        lengths.append(processed_text.size(0))
+    label_list = torch.tensor(label_list, dtype=torch.float)
+    text_list = pad_sequence(text_list, batch_first=True)
+    lengths = torch.tensor(lengths, dtype=torch.int64)
+    return label_list, text_list, lengths
+
+# Create DataLoaders for training and testing
+train_iter, test_iter = IMDB()
+train_dataloader = DataLoader(list(train_iter), batch_size=8, shuffle=True, collate_fn=collate_batch)
+test_dataloader = DataLoader(list(test_iter), batch_size=8, shuffle=False, collate_fn=collate_batch)
 
 # Define the LSTM model
 class LSTM(nn.Module):
@@ -239,12 +256,12 @@ class LSTM(nn.Module):
 
     def forward(self, text, text_lengths):
         embedded = self.embedding(text)
-        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
+        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted=False)
         packed_output, (hidden, cell) = self.lstm(packed_embedded)
         return self.fc(hidden.squeeze(0))
 
 # Instantiate the model
-INPUT_DIM = len(
+INPUT_DIM = len(vocab)
 EMBEDDING_DIM = 100
 HIDDEN_DIM = 256
 OUTPUT_DIM = 1
@@ -258,11 +275,10 @@ optimizer = optim.Adam(model.parameters())
 N_EPOCHS = 5
 for epoch in range(N_EPOCHS):
     model.train()
-    for
+    for labels, text, text_lengths in train_dataloader:
         optimizer.zero_grad()
-        text, text_lengths = batch.text
         predictions = model(text, text_lengths).squeeze(1)
-        loss = criterion(predictions,
+        loss = criterion(predictions, labels)
         loss.backward()
         optimizer.step()
 
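
The updated example pads each batch to its longest sequence in collate_batch and passes the recorded lengths to pack_padded_sequence in forward(). A standalone sketch of that pad-and-pack round trip on dummy tensors (the values here are illustrative, not taken from the Space):

import torch
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

# Two "tokenized" reviews of different lengths (dummy data).
seqs = [torch.tensor([1, 2, 3, 4]), torch.tensor([5, 6])]
lengths = torch.tensor([s.size(0) for s in seqs])            # tensor([4, 2])

# Pad to the batch maximum, as collate_batch does.
padded = pad_sequence(seqs, batch_first=True)                 # shape (2, 4); second row is zero-padded

# Pack using the recorded lengths, as the updated forward() does.
packed = pack_padded_sequence(padded.unsqueeze(-1).float(),   # fake (batch, seq, feature) embeddings
                              lengths, batch_first=True, enforce_sorted=False)
print(padded)
print(packed.batch_sizes)                                      # tensor([2, 2, 1, 1])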
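The commit also builds a test_dataloader, but the hunks shown above never consume it. A minimal evaluation sketch, assuming the model, criterion, and test_dataloader already defined in the example (and thresholding a single sigmoid output, which is an assumption about the unseen criterion):

import torch

model.eval()                                           # switch off training-only behaviour
total_loss, correct, total = 0.0, 0, 0
with torch.no_grad():                                  # no gradient tracking during evaluation
    for labels, text, text_lengths in test_dataloader:
        predictions = model(text, text_lengths).squeeze(1)
        total_loss += criterion(predictions, labels).item()
        preds = (torch.sigmoid(predictions) > 0.5).float()   # threshold the single output unit
        correct += (preds == labels).sum().item()
        total += labels.size(0)
print(f"Test loss: {total_loss / len(test_dataloader):.3f} | accuracy: {correct / total:.3f}")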