import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
import numpy as np
from datasets import load_dataset
import streamlit as st
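# Dependencies used by this script: torch, transformers, datasets, numpy, pandas, and
# streamlit (pandas is needed because the splits are converted with .to_pandas() below).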
# Load the IMDb dataset and convert the train/test splits to pandas DataFrames
dataset = load_dataset('imdb')
train_df = dataset['train'].to_pandas()
test_df = dataset['test'].to_pandas()

# Keep only the review text and its sentiment label (0 = negative, 1 = positive)
train_df = train_df[['text', 'label']]
test_df = test_df[['text', 'label']]
class SentimentDataset(Dataset):
    """Wraps a DataFrame of (text, label) rows and tokenizes reviews on the fly."""

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        review = str(self.data.iloc[index, 0])
        label = self.data.iloc[index, 1]
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',   # replaces the deprecated pad_to_max_length=True
            truncation=True,        # IMDb reviews are often longer than max_len
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one training epoch and return (accuracy, mean loss)."""
    model = model.train()
    losses = []
    correct_predictions = 0
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        loss = loss_fn(outputs.logits, labels)
        correct_predictions += torch.sum(torch.argmax(outputs.logits, dim=1) == labels)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate the model on a data loader and return (accuracy, mean loss)."""
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            loss = loss_fn(outputs.logits, labels)
            correct_predictions += torch.sum(torch.argmax(outputs.logits, dim=1) == labels)
            losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = SentimentDataset(
        dataframe=df,
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=4
    )
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Create data loaders
BATCH_SIZE = 16
MAX_LEN = 128
train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)
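# Note: neither loader shuffles the data; for training, passing shuffle=True to the
# train DataLoader is usually recommended so each batch mixes positive and negative reviews.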
EPOCHS = 2
# Note: transformers' AdamW (with correct_bias=False) is deprecated in recent releases;
# torch.optim.AdamW works as a drop-in replacement, minus the correct_bias argument.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
loss_fn = torch.nn.CrossEntropyLoss().to(device)
model = model.to(device)
# Training loop
for epoch in range(EPOCHS):
    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(train_df)
    )
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print(f'Train loss {train_loss} accuracy {train_acc}')
    val_acc, val_loss = eval_model(
        model,
        test_data_loader,
        loss_fn,
        device,
        len(test_df)
    )
    print(f'Val loss {val_loss} accuracy {val_acc}')
# Save the fine-tuned model and tokenizer so the app below can reload them
model.save_pretrained('bert-sentiment-model')
tokenizer.save_pretrained('bert-sentiment-model')
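# Note: as written, the fine-tuning above re-runs every time Streamlit re-executes this
# script. For a deployed app you would typically train once offline and serve only the
# inference section below; assuming this file is saved as app.py, launch it with:
#     streamlit run app.py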
# Streamlit app
model = BertForSequenceClassification.from_pretrained('bert-sentiment-model')
tokenizer = BertTokenizer.from_pretrained('bert-sentiment-model')
model = model.eval()
def predict_sentiment(text):
    """Classify a single review as 'positive' or 'negative'."""
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',   # replaces the deprecated pad_to_max_length=True
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
    return 'positive' if predicted_class == 1 else 'negative'
st.title("Sentiment Analysis with BERT")
user_input = st.text_area("Enter a movie review:")
if st.button("Analyze"):
    sentiment = predict_sentiment(user_input)
    st.write(f'The sentiment of the review is: **{sentiment}**')