import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
import numpy as np
from datasets import load_dataset
import streamlit as st
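# Dependencies used by this script: torch, transformers, datasets, numpy, pandas, and
# streamlit (pandas is needed because the splits are converted with .to_pandas() below).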
# Load the IMDb dataset and convert the train/test splits to pandas DataFrames
dataset = load_dataset('imdb')
train_df = dataset['train'].to_pandas()
test_df = dataset['test'].to_pandas()

# Keep only the review text and its sentiment label (0 = negative, 1 = positive)
train_df = train_df[['text', 'label']]
test_df = test_df[['text', 'label']]
class SentimentDataset(Dataset):
    """Wraps a DataFrame of (text, label) rows and tokenizes reviews on the fly."""

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        review = str(self.data.iloc[index, 0])
        label = self.data.iloc[index, 1]
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',   # replaces the deprecated pad_to_max_length=True
            truncation=True,        # IMDb reviews are often longer than max_len
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one training epoch and return (accuracy, mean loss)."""
    model = model.train()
    losses = []
    correct_predictions = 0
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        loss = loss_fn(outputs.logits, labels)
        correct_predictions += torch.sum(torch.argmax(outputs.logits, dim=1) == labels)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate the model on a data loader and return (accuracy, mean loss)."""
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            loss = loss_fn(outputs.logits, labels)
            correct_predictions += torch.sum(torch.argmax(outputs.logits, dim=1) == labels)
            losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = SentimentDataset(
        dataframe=df,
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=4
    )
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Create data loaders
BATCH_SIZE = 16
MAX_LEN = 128
train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)
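# Note: neither loader shuffles the data; for training, passing shuffle=True to the
# train DataLoader is usually recommended so each batch mixes positive and negative reviews.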
EPOCHS = 2
# Note: transformers' AdamW (with correct_bias=False) is deprecated in recent releases;
# torch.optim.AdamW works as a drop-in replacement, minus the correct_bias argument.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
loss_fn = torch.nn.CrossEntropyLoss().to(device)
model = model.to(device)
# Training loop
for epoch in range(EPOCHS):
    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(train_df)
    )
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print(f'Train loss {train_loss} accuracy {train_acc}')
    val_acc, val_loss = eval_model(
        model,
        test_data_loader,
        loss_fn,
        device,
        len(test_df)
    )
    print(f'Val loss {val_loss} accuracy {val_acc}')
# Save the fine-tuned model and tokenizer so the app below can reload them
model.save_pretrained('bert-sentiment-model')
tokenizer.save_pretrained('bert-sentiment-model')
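# Note: as written, the fine-tuning above re-runs every time Streamlit re-executes this
# script. For a deployed app you would typically train once offline and serve only the
# inference section below; assuming this file is saved as app.py, launch it with:
#     streamlit run app.py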
# Streamlit app
model = BertForSequenceClassification.from_pretrained('bert-sentiment-model')
tokenizer = BertTokenizer.from_pretrained('bert-sentiment-model')
model = model.eval()
def predict_sentiment(text):
    """Classify a single review as 'positive' or 'negative'."""
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',   # replaces the deprecated pad_to_max_length=True
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
    return 'positive' if predicted_class == 1 else 'negative'
st.title("Sentiment Analysis with BERT")
user_input = st.text_area("Enter a movie review:")
if st.button("Analyze"):
    sentiment = predict_sentiment(user_input)
    st.write(f'The sentiment of the review is: **{sentiment}**')