import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
from tqdm.auto import tqdm  # tqdm.auto picks the right progress bar for scripts and notebooks
import gradio as gr
# Download NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')
model = AutoModelForSequenceClassification.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')
# Define a function to calculate polarity scores using RoBERTa
def polarity_scores_roberta(example):
    encoded_text = tokenizer(example, return_tensors='pt')
    output = model(**encoded_text)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    # Label order for this model: 0 = negative, 1 = neutral, 2 = positive
    scores_dict = {
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2]
    }
    return scores_dict
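# Illustrative usage (hypothetical input text; the keys come from the dict above,
# and the three probabilities sum to roughly 1.0):
#   polarity_scores_roberta("The coffee arrived quickly and tastes great.")
#   -> {'roberta_neg': ..., 'roberta_neu': ..., 'roberta_pos': ...}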
# Function to perform the analysis and return results
def analyze_reviews(csv_file):
    df = pd.read_csv(csv_file)
    df = df.head(200)  # Limit to 200 reviews for faster processing

    # Plot the distribution of review scores
    ax = df['Score'].value_counts().sort_index().plot(kind='bar', title='Count of Reviews by Stars', figsize=(10, 5))
    ax.set_xlabel('Review Stars')
    plt.tight_layout()
    plt.savefig('review_distribution.png')

    # Score every review with the RoBERTa model
    res = {}
    for i, row in tqdm(df.iterrows(), total=len(df)):
        try:
            text = row['Text']
            myid = row['Id']
            roberta_result = polarity_scores_roberta(text)
            res[myid] = roberta_result
        except RuntimeError:
            print(f'Broke for id {myid}')

    results_df = pd.DataFrame(res).T
    results_df = results_df.reset_index().rename(columns={'index': 'Id'})
    results_df = results_df.merge(df, how='left')

    # Return the distribution plot, a sample of the scored reviews, the 1-star review
    # the model rates most positive, and the 5-star review it rates most negative
    return (
        'review_distribution.png',
        results_df.head().to_html(),
        results_df.query('Score == 1').sort_values('roberta_pos', ascending=False)['Text'].values[0],
        results_df.query('Score == 5').sort_values('roberta_neg', ascending=False)['Text'].values[0],
    )
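# Quick sanity check outside Gradio (assumes a Reviews.csv with 'Id', 'Score',
# and 'Text' columns in the working directory):
#   plot_path, table_html, pos_review, neg_review = analyze_reviews("Reviews.csv")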
# Define the Gradio interface
def gradio_interface():
    csv_file = "sample_data/Reviews.csv"  # Replace with the path to your CSV file
    plot_path, table_html, pos_review, neg_review = analyze_reviews(csv_file)
    return plot_path, table_html, pos_review, neg_review

gr.Interface(
    fn=gradio_interface,
    inputs=None,  # no user inputs; the analysis runs on the preloaded CSV
    outputs=[
        gr.Image(type="filepath", label="Review Distribution"),
        gr.HTML(label="Sample Results DataFrame"),
        gr.Textbox(label="Most Positive 1-Star Review"),
        gr.Textbox(label="Most Negative 5-Star Review"),
    ],
    title="Review Sentiment Analysis with RoBERTa",
    description="Analyze sentiments in a preloaded CSV file of reviews using a RoBERTa model.",
).launch()
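# To try the app locally (assuming this file is saved as app.py and pandas, matplotlib,
# transformers, torch, scipy, nltk, tqdm, and gradio are installed):
#   python app.py
# Gradio prints a local URL (typically http://127.0.0.1:7860) to open in a browser.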