ya02's picture
Update app.py
3c9379d verified
raw
history blame
3 kB
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
from tqdm.notebook import tqdm
import gradio as gr
# Fetch the NLTK resources this script asks for at start-up.
# NOTE(review): nothing in the visible code uses these resources — confirm
# whether the downloads are still needed.
for _nltk_resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(_nltk_resource)

# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint so the two always match.
MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
# Score a single piece of text with the RoBERTa sentiment model
def polarity_scores_roberta(example):
    """Return softmax sentiment scores for ``example``.

    The text is tokenized with the module-level ``tokenizer`` (PyTorch
    tensors), passed through the module-level ``model``, and the first row
    of the model's first output is converted to NumPy and normalised with
    ``scipy.special.softmax``.

    Args:
        example: Text to score; passed straight to the tokenizer.

    Returns:
        dict with keys ``'roberta_neg'``, ``'roberta_neu'`` and
        ``'roberta_pos'`` holding the softmax values at positions 0, 1 and 2
        respectively.
    """
    model_inputs = tokenizer(example, return_tensors='pt')
    model_output = model(**model_inputs)

    # First output element, first (only) row of the batch; detach before
    # converting so the tensor can be turned into a NumPy array.
    raw_scores = model_output[0][0].detach().numpy()
    probabilities = softmax(raw_scores)

    # Position in the score vector -> key in the result dict.
    label_order = ('roberta_neg', 'roberta_neu', 'roberta_pos')
    return {
        label: probabilities[position]
        for position, label in enumerate(label_order)
    }
def _top_review_text(results_df, score, sort_col):
    """Return the text of the ``score``-star review with the highest ``sort_col``.

    Falls back to a short placeholder message when no scored review has the
    requested star rating, instead of raising ``IndexError``.
    """
    matching = results_df[results_df['Score'] == score]
    if matching.empty:
        return f'No {score}-star reviews found.'
    return matching.sort_values(sort_col, ascending=False)['Text'].values[0]


# Function to perform the analysis and return results
def analyze_reviews(csv_path="Reviews.csv"):
    """Score the first 200 reviews of a CSV file with RoBERTa and summarise them.

    Args:
        csv_path: Path of the reviews CSV. The file must provide the ``Id``,
            ``Text`` and ``Score`` columns read below. Defaults to
            ``"Reviews.csv"``, the path that used to be hard-coded, so
            calling with no argument behaves as before.

    Returns:
        A 4-tuple:
            * path of the saved bar chart (``'review_distribution.png'``),
            * HTML table of the first five rows of the merged results,
            * text of the 1-star review with the highest ``roberta_pos``,
            * text of the 5-star review with the highest ``roberta_neg``.
        The last two are a placeholder message when the sample contains no
        review with that star rating.
    """
    df = pd.read_csv(csv_path)
    df = df.head(200)  # Limiting to 200 for faster processing

    # Plot the distribution of review scores
    ax = df['Score'].value_counts().sort_index().plot(
        kind='bar', title='Count of Reviews by Stars', figsize=(10, 5)
    )
    ax.set_xlabel('Review Stars')
    fig = ax.get_figure()
    fig.tight_layout()
    fig.savefig('review_distribution.png')
    # Close the figure: this function runs once per request, and unclosed
    # pyplot figures would otherwise pile up in memory.
    plt.close(fig)

    res = {}
    for _, row in tqdm(df.iterrows(), total=len(df)):
        # Read the id before the try so the except branch can always name it.
        myid = row['Id']
        try:
            res[myid] = polarity_scores_roberta(row['Text'])
        except RuntimeError:
            # Best effort: skip reviews the model cannot process and keep going.
            # NOTE(review): presumably raised for over-long inputs — confirm.
            print(f'Broke for id {myid}')

    # One row per scored review, keyed by Id, then joined back onto the
    # original review columns.
    results_df = pd.DataFrame(res).T
    results_df = results_df.reset_index().rename(columns={'index': 'Id'})
    results_df = results_df.merge(df, how='left', on='Id')

    # Return plots and the DataFrame with results
    return (
        'review_distribution.png',
        results_df.head().to_html(),
        _top_review_text(results_df, 1, 'roberta_pos'),
        _top_review_text(results_df, 5, 'roberta_neg'),
    )
# Callback executed by the Gradio interface
def gradio_interface():
    """Run the review analysis and hand its four results to Gradio.

    Takes no inputs. ``analyze_reviews`` is called without arguments so that
    it decides which CSV to read; passing a path positionally fails with
    ``TypeError`` when ``analyze_reviews`` is defined without parameters.

    Returns:
        The tuple ``(plot_path, table_html, pos_review, neg_review)``
        produced by ``analyze_reviews``, in that order.
    """
    plot_path, table_html, pos_review, neg_review = analyze_reviews()
    return plot_path, table_html, pos_review, neg_review
# Output components, listed in the same order as the values returned by
# gradio_interface.
output_components = [
    gr.Image(type="filepath", label="Review Distribution"),
    gr.HTML(label="Sample Results DataFrame"),
    gr.Textbox(label="Most Positive 1-Star Review"),
    gr.Textbox(label="Most Negative 5-Star Review"),
]

# Build the app; the callback takes no input components.
demo = gr.Interface(
    fn=gradio_interface,
    inputs=None,
    outputs=output_components,
    title="Review Sentiment Analysis with RoBERTa",
    description="Analyze sentiments in a preloaded CSV file of reviews using a RoBERTa model.",
)

# Start serving as soon as the script is run.
demo.launch()