eaglelandsonce committed on
Commit
6258c70
·
verified ·
1 Parent(s): ad12f7d

Update pages/21_NLP_Transformer.py

Browse files
Files changed (1) hide show
  1. pages/21_NLP_Transformer.py +8 -15
pages/21_NLP_Transformer.py CHANGED
@@ -1,22 +1,19 @@
1
- import pandas as pd
2
- from sklearn.model_selection import train_test_split
3
  import torch
4
  from torch.utils.data import DataLoader, Dataset
5
  from transformers import BertTokenizer, BertForSequenceClassification, AdamW
6
  from transformers import get_linear_schedule_with_warmup
7
  import numpy as np
8
- from sklearn.metrics import accuracy_score, classification_report
9
  import streamlit as st
10
 
11
- # Load and preprocess the IMDb dataset
12
- data_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
13
- df = pd.read_csv(data_url)
 
14
 
15
- df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})
16
- train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
17
-
18
- train_df.to_csv('train.csv', index=False)
19
- test_df.to_csv('test.csv', index=False)
20
 
21
  class SentimentDataset(Dataset):
22
  def __init__(self, dataframe, tokenizer, max_len):
@@ -113,10 +110,6 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
113
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
114
  model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
115
 
116
- # Load data
117
- train_df = pd.read_csv('train.csv')
118
- test_df = pd.read_csv('test.csv')
119
-
120
  # Create data loaders
121
  BATCH_SIZE = 16
122
  MAX_LEN = 128
 
 
 
1
  import torch
2
  from torch.utils.data import DataLoader, Dataset
3
  from transformers import BertTokenizer, BertForSequenceClassification, AdamW
4
  from transformers import get_linear_schedule_with_warmup
5
  import numpy as np
6
+ from datasets import load_dataset
7
  import streamlit as st
8
 
9
+ # Load IMDb dataset
10
+ dataset = load_dataset('imdb')
11
+ train_df = dataset['train'].to_pandas()
12
+ test_df = dataset['test'].to_pandas()
13
 
14
+ # Preprocess the data
15
+ train_df = train_df[['text', 'label']]
16
+ test_df = test_df[['text', 'label']]
 
 
17
 
18
  class SentimentDataset(Dataset):
19
  def __init__(self, dataframe, tokenizer, max_len):
 
110
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
111
  model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
112
 
 
 
 
 
113
  # Create data loaders
114
  BATCH_SIZE = 16
115
  MAX_LEN = 128