Ahtisham1583 committed on
Commit
256d027
·
verified ·
1 Parent(s): 70142f1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +151 -0
app.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ import re
7
+ import nltk
8
+ from nltk.corpus import stopwords
9
+ from nltk.tokenize import word_tokenize
10
+ from nltk.stem import SnowballStemmer
11
+ from sklearn.feature_extraction.text import CountVectorizer
12
+ from scipy.cluster.hierarchy import dendrogram, ward
13
+ from scipy.sparse import csr_matrix
14
+
15
# Fetch the NLTK corpora that preprocess_text depends on
# ('punkt' for word_tokenize, 'stopwords' for the English stopword list).
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)
18
+
19
def preprocess_text(text):
    """Normalize one tweet: lowercase, strip URLs/@mentions/#hashtags,
    drop English stopwords, stem the remaining tokens.

    Returns the stemmed tokens re-joined into a single space-separated string.
    """
    # Lowercase first so stopword matching and stemming are case-insensitive.
    cleaned = text.lower()
    # Remove URLs, @mentions and #hashtags, in that order.
    for pattern in (r'http\S+', r'@\S+', r'#\S+'):
        cleaned = re.sub(pattern, '', cleaned)
    stop_words = set(stopwords.words('english'))
    stemmer = SnowballStemmer('english')
    # Tokenize, filter stopwords, and stem in a single pass.
    stemmed = [
        stemmer.stem(token)
        for token in word_tokenize(cleaned)
        if token not in stop_words
    ]
    return ' '.join(stemmed)
39
+
40
def preprocess_and_plot(data):
    """Clean the tweet text, build a document-term matrix, and plot
    overall and per-airline sentiment distributions.

    Parameters
    ----------
    data : pandas.DataFrame
        Expected to contain 'text', 'airline_sentiment' and 'airline'
        columns. A 'clean_text' column is added in place (side effect on
        the caller's DataFrame, as in the original code).
    """
    # Preprocess the text (mutates the caller's DataFrame).
    data['clean_text'] = data['text'].apply(preprocess_text)

    # Document-term matrix over the cleaned text.
    vectorizer = CountVectorizer()
    dtm = vectorizer.fit_transform(data['clean_text'])

    # Convert DTM to DataFrame and carry the label columns alongside.
    tweets = pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names_out())
    # .values avoids index misalignment if `data` has a non-default index.
    tweets['airline_sentiment'] = data['airline_sentiment'].values
    # BUG FIX: the per-airline groupby below uses 'airline', which was never
    # copied into `tweets` and raised a KeyError. Copy it in as well.
    tweets['airline'] = data['airline'].values

    # Plot histogram of global sentiment.
    overall_sentiment = tweets['airline_sentiment'].value_counts().reset_index()
    overall_sentiment.columns = ['Sentiment', 'Freq']
    sns.barplot(data=overall_sentiment, x='Sentiment', y='Freq',
                palette=['indianred', 'deepskyblue', 'chartreuse'])
    plt.title('Summary Global Sentiment')
    plt.xlabel('Sentiment')
    plt.ylabel('Frequency')
    plt.show()

    # Plot sentiment distribution for each airline.
    airline_sentiment = (
        tweets.groupby(['airline', 'airline_sentiment']).size().reset_index(name='Freq')
    )
    plt.figure(figsize=(10, 6))
    sns.barplot(data=airline_sentiment, x='airline', y='Freq', hue='airline_sentiment',
                palette=['indianred', 'deepskyblue', 'chartreuse'])
    plt.title('Number of Tweets and Sentiment for Each Airline')
    plt.xlabel('Airline')
    plt.ylabel('Frequency')
    plt.legend(title='Sentiment')
    plt.xticks(rotation=45)
    plt.show()
71
+
72
def plot_pie_chart(data):
    """Draw one sentiment pie chart per airline found in *data*."""

    def _pie_for(airline_name):
        # Sentiment counts restricted to this airline's tweets.
        counts = data[data['airline'] == airline_name]['airline_sentiment'].value_counts()
        palette = ['indianred', 'deepskyblue', 'chartreuse']
        plt.pie(counts, labels=counts.index, autopct='%1.1f%%',
                startangle=140, colors=palette)
        plt.title(f'{airline_name} Sentiment Distribution')
        plt.show()

    for name in data['airline'].unique():
        _pie_for(name)
84
+
85
def plot_reasons(data):
    """Plot each negative reason as a percentage of its airline's total complaints."""
    # Contingency table: rows = complaint reason, columns = airline.
    reason_table = pd.crosstab(data['negativereason'], data['airline'])
    # Long format: one (reason, airline, count) row per cell.
    melted = reason_table.reset_index().melt(
        id_vars='negativereason', var_name='Airline', value_name='Freq')
    # Per-airline totals, merged back so each count becomes a share.
    totals = melted.groupby('Airline')['Freq'].sum().reset_index()
    totals.columns = ['Airline', 'Total']
    melted = melted.merge(totals, on='Airline', how='left')
    melted['PercentOfTotal'] = melted['Freq'] / melted['Total'] * 100

    plt.figure(figsize=(12, 8))
    sns.barplot(data=melted, x='negativereason', y='PercentOfTotal', hue='Airline')
    plt.xticks(rotation=45)
    plt.title('Percentage of Total Complaints by Reason and Airline')
    plt.ylabel('Percentage of Total Complaints')
    plt.xlabel('Reason for Complaint')
    plt.legend(title='Airline', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
102
+
103
def plot_word_frequency(data):
    """Bar-chart every token appearing more than 50 times in data['clean_text']."""

    def _word_frequencies(corpus):
        # Column sums of the document-term matrix give per-word totals.
        vectorizer = CountVectorizer()
        dtm = vectorizer.fit_transform(corpus)
        totals = dtm.sum(axis=0)
        return pd.DataFrame({
            'word': vectorizer.get_feature_names_out(),
            'freq': totals.tolist()[0],
        })

    frequencies = _word_frequencies(data['clean_text'])
    frequent = frequencies[frequencies['freq'] > 50].sort_values(by='freq', ascending=False)

    plt.figure(figsize=(12, 6))
    plt.bar(frequent['word'], frequent['freq'], color='skyblue')
    plt.xlabel('Word', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    plt.title('Word Frequency', fontsize=14)
    plt.xticks(rotation=45, ha='right', fontsize=10)
    plt.yticks(fontsize=10)
    plt.show()
121
+
122
def plot_dendrogram(data):
    """Hierarchically cluster frequent terms and display the dendrogram."""
    vectorizer = CountVectorizer()
    dtm = vectorizer.fit_transform(data['clean_text'])
    dense = dtm.toarray()

    # Drop sparse terms: keep only words occurring at least 3 times overall.
    min_occurrences = 3
    keep = (dense.sum(axis=0) >= min_occurrences).ravel()
    frequent_terms = dense[:, keep]

    # Ward linkage on the transposed matrix clusters terms, not documents.
    linkage = ward(frequent_terms.T)

    plt.figure(figsize=(15, 6))
    dendrogram(linkage, leaf_rotation=90, leaf_font_size=10,
               labels=vectorizer.get_feature_names_out()[keep])
    plt.xlabel('Terms')
    plt.ylabel('Distance')
    plt.title('Dendrogram')
    plt.show()
144
+
145
# Create the Gradio interface.
# FIX: `gr.inputs.Dataframe` belongs to the Gradio 1.x/2.x namespace that was
# removed in Gradio 3.x; the component is now exposed at top level as
# `gr.Dataframe`. `outputs=None` is kept because `preprocess_and_plot` renders
# its figures via matplotlib directly instead of returning component values.
iface = gr.Interface(
    preprocess_and_plot,
    inputs=gr.Dataframe(label="Enter your DataFrame here"),
    outputs=None,
    title="Social Media Trend Analysis",
    description="Analyze sentiment and trends in your social media data.",
)
iface.launch()