ya02 committed on
Commit
e1c6483
·
verified ·
1 Parent(s): 7d12038

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -0
app.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ import seaborn as sns
5
+ import nltk
6
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
7
+ from scipy.special import softmax
8
+ from tqdm.notebook import tqdm
9
+ import gradio as gr
10
+
11
# Fetch the NLTK resources when the module is imported.
for _nltk_resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(_nltk_resource)

# Instantiate the pretrained tokenizer and classifier once, from the same
# checkpoint, so every request reuses them.
_CHECKPOINT = 'cardiffnlp/twitter-roberta-base-sentiment'
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(_CHECKPOINT)
18
+
19
+ # Define a function to calculate polarity scores using RoBERTa
20
def polarity_scores_roberta(example):
    """Return RoBERTa sentiment probabilities for one piece of text.

    Args:
        example: Text handed to the module-level ``tokenizer``.

    Returns:
        A dict with the keys ``'roberta_neg'``, ``'roberta_neu'`` and
        ``'roberta_pos'``, holding elements 0, 1 and 2 of the softmax of
        the model's output scores for ``example``.
    """
    # Local import: torch is only needed for the no_grad context below.
    import torch

    encoded_text = tokenizer(example, return_tensors='pt')
    # NOTE(review): the input is not truncated; over-long text presumably
    # fails inside the model -- the caller catches RuntimeError for that.
    # Inference only: skip building the autograd graph for the forward pass.
    with torch.no_grad():
        output = model(**encoded_text)
    scores = softmax(output[0][0].detach().numpy())
    return {
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2],
    }
31
+
32
+ # Function to perform the analysis and return results
33
def _most_extreme_review(results_df, stars, score_column):
    """Return the text of the ``stars``-star review ranked highest by ``score_column``.

    Falls back to a short message when no such review exists or when no
    review was scored at all (the score column is then absent).
    """
    if score_column not in results_df.columns or 'Score' not in results_df.columns:
        return f'No {stars}-star review could be scored.'
    candidates = results_df[results_df['Score'] == stars]
    if candidates.empty:
        return f'No {stars}-star review could be scored.'
    ranked = candidates.sort_values(score_column, ascending=False)
    return ranked['Text'].iloc[0]


def analyze_reviews(csv_file):
    """Score the first 200 reviews of a CSV with RoBERTa and summarise them.

    The function reads the ``Id``, ``Text`` and ``Score`` columns of the CSV.
    It saves a bar chart of the ``Score`` counts to
    ``review_distribution.png`` in the working directory.

    Args:
        csv_file: Anything ``pandas.read_csv`` accepts (path or buffer).

    Returns:
        A 4-tuple ``(plot_path, table_html, pos_review, neg_review)``:
        the path of the saved chart, the first five result rows rendered as
        HTML, the text of the 1-star review with the highest
        ``roberta_pos``, and the text of the 5-star review with the highest
        ``roberta_neg``. When no matching review was scored, the
        corresponding text is a short explanatory message instead.
    """
    df = pd.read_csv(csv_file).head(200)  # Limiting to 200 for faster processing

    # Plot the distribution of review scores.
    plot_path = 'review_distribution.png'
    ax = df['Score'].value_counts().sort_index().plot(
        kind='bar',
        title='Count of Reviews by Stars',
        figsize=(10, 5),
    )
    ax.set_xlabel('Review Stars')
    fig = ax.figure
    fig.tight_layout()
    fig.savefig(plot_path)
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

    # NOTE(review): tqdm is imported from tqdm.notebook at the top of the
    # file; that variant targets Jupyter -- confirm it works when this runs
    # as a plain script.
    res = {}
    for _, row in tqdm(df.iterrows(), total=len(df)):
        review_id = row['Id']
        try:
            res[review_id] = polarity_scores_roberta(row['Text'])
        except RuntimeError:
            # Best effort: skip reviews the model cannot process.
            print(f'Broke for id {review_id}')

    results_df = pd.DataFrame(res).T
    results_df = results_df.reset_index().rename(columns={'index': 'Id'})
    # Join explicitly on the review id rather than on whatever columns
    # happen to share a name.
    results_df = results_df.merge(df, how='left', on='Id')

    pos_review = _most_extreme_review(results_df, 1, 'roberta_pos')
    neg_review = _most_extreme_review(results_df, 5, 'roberta_neg')
    return plot_path, results_df.head().to_html(), pos_review, neg_review
59
+
60
+ # Define the Gradio interface
61
def gradio_interface():
    """Run the review analysis on the bundled sample CSV for the Gradio UI.

    Returns:
        The four values produced by ``analyze_reviews`` in their original
        order: plot path, results table HTML, and the two review texts.
    """
    source_csv = "sample_data/Reviews.csv"  # Replace with the path to your CSV file
    chart, table, top_positive, top_negative = analyze_reviews(source_csv)
    return chart, table, top_positive, top_negative
65
+
66
# Output widgets, listed in the same order as gradio_interface's return values.
output_components = [
    gr.Image(type="filepath", label="Review Distribution"),
    gr.HTML(label="Sample Results DataFrame"),
    gr.Textbox(label="Most Positive 1-Star Review"),
    gr.Textbox(label="Most Negative 5-Star Review"),
]

# The callback takes no arguments, so the interface has no input widgets.
demo = gr.Interface(
    fn=gradio_interface,
    inputs=None,
    outputs=output_components,
    title="Review Sentiment Analysis with RoBERTa",
    description="Analyze sentiments in a preloaded CSV file of reviews using a RoBERTa model.",
)
demo.launch()