import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
from tqdm.auto import tqdm  # tqdm.auto picks the right progress bar for scripts and notebooks
import gradio as gr
# Download NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')
model = AutoModelForSequenceClassification.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')
# Define a function to calculate polarity scores using RoBERTa
def polarity_scores_roberta(example):
    encoded_text = tokenizer(example, return_tensors='pt')
    output = model(**encoded_text)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    # Label order for this model: 0 = negative, 1 = neutral, 2 = positive
    scores_dict = {
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2]
    }
    return scores_dict
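# Illustrative usage (hypothetical input text; the keys come from the dict above,
# and the three probabilities sum to roughly 1.0):
#   polarity_scores_roberta("The coffee arrived quickly and tastes great.")
#   -> {'roberta_neg': ..., 'roberta_neu': ..., 'roberta_pos': ...}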
# Function to perform the analysis and return results
def analyze_reviews(csv_file):
    df = pd.read_csv(csv_file)
    df = df.head(200)  # Limit to 200 reviews for faster processing

    # Plot the distribution of review scores
    ax = df['Score'].value_counts().sort_index().plot(kind='bar', title='Count of Reviews by Stars', figsize=(10, 5))
    ax.set_xlabel('Review Stars')
    plt.tight_layout()
    plt.savefig('review_distribution.png')

    # Score every review with the RoBERTa model
    res = {}
    for i, row in tqdm(df.iterrows(), total=len(df)):
        try:
            text = row['Text']
            myid = row['Id']
            roberta_result = polarity_scores_roberta(text)
            res[myid] = roberta_result
        except RuntimeError:
            print(f'Broke for id {myid}')

    results_df = pd.DataFrame(res).T
    results_df = results_df.reset_index().rename(columns={'index': 'Id'})
    results_df = results_df.merge(df, how='left')

    # Return the distribution plot, a sample of the scored reviews, the 1-star review
    # the model rates most positive, and the 5-star review it rates most negative
    return (
        'review_distribution.png',
        results_df.head().to_html(),
        results_df.query('Score == 1').sort_values('roberta_pos', ascending=False)['Text'].values[0],
        results_df.query('Score == 5').sort_values('roberta_neg', ascending=False)['Text'].values[0],
    )
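# Quick sanity check outside Gradio (assumes a Reviews.csv with 'Id', 'Score',
# and 'Text' columns in the working directory):
#   plot_path, table_html, pos_review, neg_review = analyze_reviews("Reviews.csv")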
# Define the Gradio interface
def gradio_interface():
    csv_file = "sample_data/Reviews.csv"  # Replace with the path to your CSV file
    plot_path, table_html, pos_review, neg_review = analyze_reviews(csv_file)
    return plot_path, table_html, pos_review, neg_review

gr.Interface(
    fn=gradio_interface,
    inputs=None,  # no user inputs; the analysis runs on the preloaded CSV
    outputs=[
        gr.Image(type="filepath", label="Review Distribution"),
        gr.HTML(label="Sample Results DataFrame"),
        gr.Textbox(label="Most Positive 1-Star Review"),
        gr.Textbox(label="Most Negative 5-Star Review"),
    ],
    title="Review Sentiment Analysis with RoBERTa",
    description="Analyze sentiments in a preloaded CSV file of reviews using a RoBERTa model.",
).launch()
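# To try the app locally (assuming this file is saved as app.py and pandas, matplotlib,
# transformers, torch, scipy, nltk, tqdm, and gradio are installed):
#   python app.py
# Gradio prints a local URL (typically http://127.0.0.1:7860) to open in a browser.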