# pages/21_NLP_Transformer.py: fine-tune BERT on IMDb and serve it via Streamlit
import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW  # transformers' AdamW was removed in recent releases; torch's works here
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
import numpy as np
from datasets import load_dataset
import streamlit as st
# Load IMDb dataset
dataset = load_dataset('imdb')
train_df = dataset['train'].to_pandas()
test_df = dataset['test'].to_pandas()
# Preprocess the data
train_df = train_df[['text', 'label']]
test_df = test_df[['text', 'label']]
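# IMDb labels are already numeric: 0 = negative, 1 = positive (12,500 of each
# class in both splits), so no extra label encoding is needed.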
class SentimentDataset(Dataset):
    """Wraps the IMDb dataframe and tokenizes one review per item."""

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        review = str(self.data.iloc[index, 0])
        label = self.data.iloc[index, 1]
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',  # 'pad_to_max_length=True' is deprecated
            truncation=True,       # IMDb reviews routinely exceed max_len
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }
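# Quick sanity check (sketch): one item yields fixed-length tensors, e.g.
#   item = SentimentDataset(train_df, tokenizer, max_len=128)[0]
#   item['input_ids'].shape       -> torch.Size([128])
#   item['attention_mask'].shape  -> torch.Size([128])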
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model.train()
    losses = []
    correct_predictions = 0
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        loss = loss_fn(outputs.logits, labels)
        correct_predictions += torch.sum(torch.argmax(outputs.logits, dim=1) == labels)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        scheduler.step()  # step the LR schedule once per batch
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)
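# A common extension here (not in the original) is gradient clipping before
# optimizer.step(), e.g.:
#   torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)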
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            loss = loss_fn(outputs.logits, labels)
            correct_predictions += torch.sum(torch.argmax(outputs.logits, dim=1) == labels)
            losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
    ds = SentimentDataset(
        dataframe=df,
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,  # shuffle training batches; keep eval order deterministic
        num_workers=4
    )
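# Note: num_workers=4 assumes a fork-capable host; on Windows, or if the loader
# hangs under Streamlit, dropping to num_workers=0 is a safe fallback.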
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # binary sentiment head
# Create data loaders
BATCH_SIZE = 16
MAX_LEN = 128
train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE, shuffle=True)
test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)
EPOCHS = 2
optimizer = AdamW(model.parameters(), lr=2e-5)  # 'correct_bias' was specific to the removed transformers.AdamW
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
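# With 25,000 training reviews and BATCH_SIZE=16, that is 1,563 steps per epoch
# (3,126 total). A nonzero warmup, e.g. int(0.1 * total_steps), is a common
# alternative to num_warmup_steps=0.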
loss_fn = torch.nn.CrossEntropyLoss().to(device)
model = model.to(device)
# Training loop
for epoch in range(EPOCHS):
    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(train_df)
    )
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print(f'Train loss {train_loss:.4f} accuracy {train_acc:.4f}')
    val_acc, val_loss = eval_model(
        model,
        test_data_loader,
        loss_fn,
        device,
        len(test_df)
    )
    print(f'Val loss {val_loss:.4f} accuracy {val_acc:.4f}')
# Save the model
model.save_pretrained('bert-sentiment-model')
tokenizer.save_pretrained('bert-sentiment-model')
# Streamlit app
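# Note: as written, the whole training run above re-executes every time
# Streamlit reruns the script. A common pattern (sketch, assuming the model was
# already saved to 'bert-sentiment-model' by a previous run) is to cache the
# loaded model so reruns only do inference:
#
#   @st.cache_resource
#   def load_sentiment_model():
#       m = BertForSequenceClassification.from_pretrained('bert-sentiment-model')
#       t = BertTokenizer.from_pretrained('bert-sentiment-model')
#       m.eval()
#       return m, t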
model = BertForSequenceClassification.from_pretrained('bert-sentiment-model')
tokenizer = BertTokenizer.from_pretrained('bert-sentiment-model')
model = model.eval()
def predict_sentiment(text):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',  # same deprecation fix as in SentimentDataset
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
    return 'positive' if predicted_class == 1 else 'negative'
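# Example usage (expected behavior, not verified output):
#   predict_sentiment("A wonderful, heartfelt film.")                -> 'positive'
#   predict_sentiment("Two hours of my life I'll never get back.")   -> 'negative'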
st.title("Sentiment Analysis with BERT")
user_input = st.text_area("Enter a movie review:")
if st.button("Analyze"):
    if user_input.strip():  # guard against an empty submission
        sentiment = predict_sentiment(user_input)
        st.write(f'The sentiment of the review is: **{sentiment}**')
    else:
        st.warning("Please enter a review first.")